Repository: BlueBrain/CoreNeuron
Branch: master
Commit: 3a49f4b85a97
Files: 246
Total size: 1.3 MB

Directory structure:
gitextract_m67oskuu/

├── .bbp-project.yaml
├── .clang-format.changes
├── .cmake-format.changes.yaml
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   └── feature_request.md
│   ├── problem-matchers/
│   │   ├── address.json
│   │   ├── gcc.json
│   │   └── undefined.json
│   ├── pull_request_template.md
│   └── workflows/
│       ├── clang_cmake_format_check.yaml
│       ├── coreneuron-ci.yml
│       ├── coverage.yml
│       └── test-as-submodule.yml
├── .gitignore
├── .gitlab-ci.yml
├── .gitmodules
├── .readthedocs.yml
├── .sanitizers/
│   └── undefined.supp
├── AUTHORS.txt
├── CMake/
│   ├── AddHpcCodingConvSubmodule.cmake
│   ├── AddMod2cSubmodule.cmake
│   ├── AddNmodlSubmodule.cmake
│   ├── AddRandom123Submodule.cmake
│   ├── CrayPortability.cmake
│   ├── GitRevision.cmake
│   ├── MakefileBuildOptions.cmake
│   ├── OpenAccHelper.cmake
│   ├── TestScriptUtils.cmake
│   ├── config/
│   │   ├── CompilerFlagsHelpers.cmake
│   │   ├── ReleaseDebugAutoFlags.cmake
│   │   ├── SetRpath.cmake
│   │   └── TestHelpers.cmake
│   ├── coreneuron-config.cmake.in
│   └── packages/
│       ├── FindSphinx.cmake
│       ├── Findlikwid.cmake
│       ├── Findnmodl.cmake
│       └── Findreportinglib.cmake
├── CMakeLists.txt
├── LICENSE.txt
├── README.md
├── coreneuron/
│   ├── CMakeLists.txt
│   ├── apps/
│   │   ├── coreneuron.cpp
│   │   ├── corenrn_parameters.cpp
│   │   ├── corenrn_parameters.hpp
│   │   └── main1.cpp
│   ├── config/
│   │   ├── config.cpp.in
│   │   ├── config.h
│   │   ├── neuron_version.hpp.in
│   │   └── version_macros.hpp
│   ├── coreneuron.hpp
│   ├── engine.h.in
│   ├── gpu/
│   │   ├── nrn_acc_manager.cpp
│   │   └── nrn_acc_manager.hpp
│   ├── io/
│   │   ├── core2nrn_data_return.cpp
│   │   ├── core2nrn_data_return.hpp
│   │   ├── file_utils.cpp
│   │   ├── file_utils.hpp
│   │   ├── global_vars.cpp
│   │   ├── lfp.cpp
│   │   ├── lfp.hpp
│   │   ├── mech_report.cpp
│   │   ├── mech_report.h
│   │   ├── mem_layout_util.cpp
│   │   ├── mem_layout_util.hpp
│   │   ├── mk_mech.cpp
│   │   ├── nrn2core_data_init.cpp
│   │   ├── nrn2core_direct.h
│   │   ├── nrn_checkpoint.cpp
│   │   ├── nrn_checkpoint.hpp
│   │   ├── nrn_filehandler.cpp
│   │   ├── nrn_filehandler.hpp
│   │   ├── nrn_setup.cpp
│   │   ├── nrn_setup.hpp
│   │   ├── nrnsection_mapping.hpp
│   │   ├── output_spikes.cpp
│   │   ├── output_spikes.hpp
│   │   ├── phase1.cpp
│   │   ├── phase1.hpp
│   │   ├── phase2.cpp
│   │   ├── phase2.hpp
│   │   ├── prcellstate.cpp
│   │   ├── prcellstate.hpp
│   │   ├── reports/
│   │   │   ├── binary_report_handler.cpp
│   │   │   ├── binary_report_handler.hpp
│   │   │   ├── nrnreport.cpp
│   │   │   ├── nrnreport.hpp
│   │   │   ├── report_configuration_parser.cpp
│   │   │   ├── report_event.cpp
│   │   │   ├── report_event.hpp
│   │   │   ├── report_handler.cpp
│   │   │   ├── report_handler.hpp
│   │   │   ├── sonata_report_handler.cpp
│   │   │   └── sonata_report_handler.hpp
│   │   ├── setup_fornetcon.cpp
│   │   ├── setup_fornetcon.hpp
│   │   └── user_params.hpp
│   ├── mechanism/
│   │   ├── capac.cpp
│   │   ├── eion.cpp
│   │   ├── eion.hpp
│   │   ├── mech/
│   │   │   ├── cfile/
│   │   │   │   └── cabvars.h
│   │   │   ├── enginemech.cpp
│   │   │   ├── mod2c_core_thread.hpp
│   │   │   ├── mod_func.c.pl
│   │   │   └── modfile/
│   │   │       ├── exp2syn.mod
│   │   │       ├── expsyn.mod
│   │   │       ├── hh.mod
│   │   │       ├── netstim.mod
│   │   │       ├── passive.mod
│   │   │       ├── pattern.mod
│   │   │       ├── stim.mod
│   │   │       └── svclmp.mod
│   │   ├── mech_mapping.cpp
│   │   ├── mech_mapping.hpp
│   │   ├── mechanism.hpp
│   │   ├── membfunc.hpp
│   │   ├── patternstim.cpp
│   │   ├── register_mech.cpp
│   │   └── register_mech.hpp
│   ├── membrane_definitions.h
│   ├── mpi/
│   │   ├── core/
│   │   │   ├── nrnmpi.hpp
│   │   │   ├── nrnmpi_def_cinc.cpp
│   │   │   ├── nrnmpidec.cpp
│   │   │   └── resolve.cpp
│   │   ├── lib/
│   │   │   ├── mpispike.cpp
│   │   │   ├── nrnmpi.cpp
│   │   │   └── nrnmpi.hpp
│   │   ├── nrnmpi.h
│   │   ├── nrnmpidec.h
│   │   └── nrnmpiuse.h
│   ├── network/
│   │   ├── cvodestb.cpp
│   │   ├── have2want.h
│   │   ├── multisend.cpp
│   │   ├── multisend.hpp
│   │   ├── multisend_setup.cpp
│   │   ├── netcon.hpp
│   │   ├── netcvode.cpp
│   │   ├── netcvode.hpp
│   │   ├── netpar.cpp
│   │   ├── netpar.hpp
│   │   ├── partrans.cpp
│   │   ├── partrans.hpp
│   │   ├── partrans_setup.cpp
│   │   ├── tnode.hpp
│   │   ├── tqueue.cpp
│   │   ├── tqueue.hpp
│   │   └── tqueue.ipp
│   ├── nrnconf.h
│   ├── nrniv/
│   │   └── nrniv_decl.h
│   ├── nrnoc/
│   │   ├── md1redef.h
│   │   └── md2redef.h
│   ├── permute/
│   │   ├── balance.cpp
│   │   ├── cellorder.cpp
│   │   ├── cellorder.cu
│   │   ├── cellorder.hpp
│   │   ├── cellorder1.cpp
│   │   ├── cellorder2.cpp
│   │   ├── data_layout.cpp
│   │   ├── data_layout.hpp
│   │   ├── node_permute.cpp
│   │   └── node_permute.h
│   ├── sim/
│   │   ├── fadvance_core.cpp
│   │   ├── fast_imem.cpp
│   │   ├── fast_imem.hpp
│   │   ├── finitialize.cpp
│   │   ├── multicore.cpp
│   │   ├── multicore.hpp
│   │   ├── scopmath/
│   │   │   ├── abort.cpp
│   │   │   ├── crout_thread.hpp
│   │   │   ├── errcodes.h
│   │   │   ├── newton_struct.h
│   │   │   ├── newton_thread.cpp
│   │   │   ├── newton_thread.hpp
│   │   │   ├── sparse_thread.hpp
│   │   │   └── ssimplic_thread.hpp
│   │   ├── solve_core.cpp
│   │   └── treeset_core.cpp
│   └── utils/
│       ├── ivocvect.cpp
│       ├── ivocvect.hpp
│       ├── lpt.cpp
│       ├── lpt.hpp
│       ├── memory.cpp
│       ├── memory.h
│       ├── memory_utils.cpp
│       ├── memory_utils.h
│       ├── nrn_assert.h
│       ├── nrn_stats.cpp
│       ├── nrn_stats.h
│       ├── nrnmutdec.hpp
│       ├── nrnoc_aux.cpp
│       ├── nrnoc_aux.hpp
│       ├── nrntimeout.cpp
│       ├── offload.hpp
│       ├── profile/
│       │   └── profiler_interface.h
│       ├── progressbar/
│       │   ├── progressbar.cpp
│       │   └── progressbar.hpp
│       ├── randoms/
│       │   ├── nrnran123.cpp
│       │   └── nrnran123.h
│       ├── string_utils.cpp
│       ├── string_utils.h
│       ├── units.hpp
│       ├── utils.cpp
│       ├── utils.hpp
│       ├── utils_cuda.h
│       ├── vrecitem.h
│       └── vrecord.cpp
├── docs/
│   ├── Doxyfile.in
│   ├── DoxygenLayout.xml
│   ├── README.md
│   ├── _static/
│   │   └── custom.css
│   ├── conda_environment.yml
│   ├── conf.py
│   ├── docs_requirements.txt
│   ├── doxygen.rst
│   ├── footer.html
│   ├── index.rst
│   └── userdoc/
│       ├── BinaryFormat/
│       │   └── BinaryFormat.md
│       └── MemoryManagement/
│           └── bbcorepointer.md
├── extra/
│   ├── CMakeLists.txt
│   ├── instrumentation.tau
│   ├── nrnivmodl-core.in
│   └── nrnivmodl_core_makefile.in
└── tests/
    ├── CMakeLists.txt
    ├── integration/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── integration_test.sh.in
    │   ├── reportinglib/
    │   │   ├── 1.check.in
    │   │   ├── 1.conf.in
    │   │   ├── 1.report
    │   │   ├── reporting_test.sh.in
    │   │   └── test_ref.out
    │   ├── ring/
    │   │   └── out.dat.ref
    │   └── ring_gap/
    │       ├── mod files/
    │       │   └── halfgap.mod
    │       └── out.dat.ref
    └── unit/
        ├── alignment/
        │   ├── CMakeLists.txt
        │   └── alignment.cpp
        ├── cmdline_interface/
        │   ├── CMakeLists.txt
        │   └── test_cmdline_interface.cpp
        ├── interleave_info/
        │   ├── CMakeLists.txt
        │   └── check_constructors.cpp
        ├── lfp/
        │   ├── CMakeLists.txt
        │   └── lfp.cpp
        ├── queueing/
        │   ├── CMakeLists.txt
        │   └── test_queueing.cpp
        └── solver/
            ├── CMakeLists.txt
            └── test_solver.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .bbp-project.yaml
================================================
tools:
  ClangFormat:
    enable: True
    include:
      match:
      - coreneuron/.*\.((cu)|(h)|([chi]pp))$
  CMakeFormat:
    enable: True


================================================
FILE: .clang-format.changes
================================================
IndentCaseLabels: true
SortIncludes: false
StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]


================================================
FILE: .cmake-format.changes.yaml
================================================
additional_commands:
  cpp_cc_build_time_copy:
    flags: ['NO_TARGET']
    kwargs:
      INPUT: '1'
      OUTPUT: '1'


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the issue**
A clear and concise description of what the issue is.

**To Reproduce**
Steps to reproduce the behavior:
```bash
A simple script
```

**Expected behavior**
A clear and concise description of what you expected to happen.

**Logs**
If possible attach helpful logs related to the issue.
If there is an issue during build `CMakeError.log`, `CMakeOutput.log` or the output of `make VERBOSE=1` would be helpful.
Otherwise, attach any error printed to the terminal.

**System (please complete the following information)**
 - OS: [e.g. Ubuntu 20.04]
 - Compiler: [e.g. PGI 20.9]
 - Version: [e.g. master branch]
 - Backend: [e.g. CPU]

**Additional context**
Add any other context about the problem here.


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: true


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context about the feature request here.


================================================
FILE: .github/problem-matchers/address.json
================================================

{
    "problemMatcher": [
        {
            "owner": "asan-problem-matcher",
            "severity": "warning",
            "pattern": [
                {
                    "regexp": "^.*AddressSanitizer: (.*)$",
                    "message": 1
                }
            ]
        }
    ]
}


================================================
FILE: .github/problem-matchers/gcc.json
================================================
{
    "__comment": "Taken from vscode-cpptools's Extension/package.json gcc rule",
    "problemMatcher": [
        {
            "owner": "gcc-problem-matcher",
            "pattern": [
                {
                    "regexp": "^\\.\\./(.*):(\\d+):(\\d+):\\s+(?:fatal\\s+)?(warning|error):\\s+(.*)$",
                    "file": 1,
                    "line": 2,
                    "column": 3,
                    "severity": 4,
                    "message": 5
                }
            ]
        }
    ]
}


================================================
FILE: .github/problem-matchers/undefined.json
================================================
{
    "problemMatcher": [
        {
            "owner": "ubsan-problem-matcher",
            "severity": "warning",
            "pattern": [
                {
                    "regexp": "^.*\\/(src\\/.*):(\\d+):(\\d+): runtime error: (.*)$",
                    "file": 1,
                    "line": 2,
                    "column": 3,
                    "message": 4
                },
                {
                    "regexp": "^.*UndefinedBehaviorSanitizer:.*$"
                }
            ]
        }
    ]
}


================================================
FILE: .github/pull_request_template.md
================================================
**Description**

Please include a summary of the change and which issue is fixed or which feature is added.

- [ ] Issue 1 fixed
- [ ] Issue 2 fixed
- [ ] Feature 1 added
- [ ] Feature 2 added

Fixes # (issue)

**How to test this?**

Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce if there is no integration test added with this PR. Please also list any relevant details for your test configuration.

```bash
cmake ..
make -j8
nrnivmodl mod
./bin/nrnivmodl-core mod
./x86_64/special script.py
./x86_64/special-core --tstop=10 --datpath=coredat
```

**Test System**
 - OS: [e.g. Ubuntu 20.04]
 - Compiler: [e.g. PGI 20.9]
 - Version: [e.g. master branch]
 - Backend: [e.g. CPU]

**Use certain branches in CI pipelines.**
<!-- You can steer which versions of CoreNEURON dependencies will be used in
     the various CI pipelines (GitLab, test-as-submodule) here. Expressions are
     of the form PROJ_REF=VALUE, where PROJ is the relevant Spack package name,
     transformed to upper case and with hyphens replaced with underscores.
     REF may be BRANCH, COMMIT or TAG, with exceptions:
      - SPACK_COMMIT and SPACK_TAG are invalid (hpc/gitlab-pipelines limitation)
      - NEURON_COMMIT and NEURON_TAG are invalid (test-as-submodule limitation)
     These values for NEURON, nmodl and Spack are the defaults and are given
     for illustrative purposes; they can safely be removed.
-->
CI_BRANCHES:NEURON_BRANCH=master,NMODL_BRANCH=master,SPACK_BRANCH=develop


================================================
FILE: .github/workflows/clang_cmake_format_check.yaml
================================================
name: clang-cmake-format-check

concurrency:
  group: ${{ github.workflow }}#${{ github.ref }}
  cancel-in-progress: true

on:
    push:

jobs:
  build:
    name: clang-cmake-format-check
    runs-on: ubuntu-22.04
    steps:
        - name: Fetch repository
          uses: actions/checkout@v3
        - name: Fetch hpc-coding-conventions submodules
          shell: bash
          working-directory: ${{runner.workspace}}/CoreNeuron
          run: git submodule update --init --depth 1 -- CMake/hpc-coding-conventions
        - name: Run clang-format and cmake-format
          shell: bash
          working-directory: ${{runner.workspace}}/CoreNeuron
          run: CMake/hpc-coding-conventions/bin/format -v --dry-run


================================================
FILE: .github/workflows/coreneuron-ci.yml
================================================
name: CoreNEURON CI

concurrency:
  group: ${{ github.workflow }}#${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    branches:
      - master
      - release/**
  pull_request:
    branches:
      - master
      - release/**

env:
  BUILD_TYPE: Release
  DEFAULT_PY_VERSION: 3.8
  MACOSX_DEPLOYMENT_TARGET: 11.0

jobs:
  ci:
    runs-on: ${{ matrix.os }}

    name: ${{ matrix.os }} - ${{ toJson(matrix.config) }})

    env:
      SDK_ROOT: $(xcrun --sdk macosx --show-sdk-path)

    strategy:
      matrix:
        os: [ubuntu-20.04, macOS-11]
        config:
          # Defaults: CORENRN_ENABLE_MPI=ON
          - {cmake_option: "-DCORENRN_ENABLE_MPI_DYNAMIC=ON", flag_warnings: ON}
          - {cmake_option: "-DCORENRN_ENABLE_MPI_DYNAMIC=ON -DCORENRN_ENABLE_SHARED=OFF"}
          - {cmake_option: "-DCORENRN_ENABLE_MPI=OFF"}
          - {use_nmodl: ON, py_version: 3.7}
          - {use_nmodl: ON}
        include:
          - os: ubuntu-20.04
            config:
              gcc_version: 10
          - os: ubuntu-20.04
            config:
              cmake_option: -DCORENRN_ENABLE_DEBUG_CODE=ON
              documentation: ON
          - os: ubuntu-22.04
            config:
              sanitizer: address
          - os: ubuntu-22.04
            config:
              flag_warnings: ON
              sanitizer: undefined
      fail-fast: false

    steps:

      - name: Install homebrew packages
        if: startsWith(matrix.os, 'macOS')
        run: |
          brew update
          brew install bison boost ccache coreutils flex ninja openmpi
          echo /usr/local/opt/flex/bin:/usr/local/opt/bison/bin >> $GITHUB_PATH
        shell: bash

      - name: Install apt packages
        if: startsWith(matrix.os, 'ubuntu')
        run: |
          sudo apt-get install bison ccache doxygen flex libboost-all-dev \
            libfl-dev libopenmpi-dev ninja-build openmpi-bin
        shell: bash

      - name: Install specific apt packages
        if: startsWith(matrix.os, 'ubuntu') && matrix.config.gcc_version
        run: |
          sudo apt-get install gcc-${{matrix.config.gcc_version}}
          echo CC="gcc-${{matrix.config.gcc_version}}" >> $GITHUB_ENV
          echo CXX="g++-${{matrix.config.gcc_version}}" >> $GITHUB_ENV
        shell: bash

      - name: Set up Python3
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
        env:
          PYTHON_VERSION: ${{matrix.config.py_version || env.DEFAULT_PY_VERSION}}

      - name: Install NMODL dependencies
        if: ${{ matrix.config.use_nmodl == 'ON' }}
        run: |
          python3 -m pip install --upgrade pip jinja2 pyyaml pytest sympy

      - uses: actions/checkout@v3

      - name: Install documentation dependencies
        if: ${{matrix.config.documentation == 'ON'}}
        working-directory: ${{runner.workspace}}/CoreNeuron
        run: |
          sudo apt-get install doxygen
          python3 -m pip install --upgrade pip
          python3 -m pip install --upgrade -r docs/docs_requirements.txt

      - name: Register compiler warning problem matcher
        if: ${{matrix.config.flag_warnings == 'ON'}}
        run: echo "::add-matcher::.github/problem-matchers/gcc.json"

      - name: Register sanitizer problem matcher
        if: ${{matrix.config.sanitizer}}
        run: echo "::add-matcher::.github/problem-matchers/${{matrix.config.sanitizer}}.json"

      - name: Hash config dictionary
        run: |
          cat << EOF > matrix.json
          ${{toJSON(matrix.config)}}
          EOF
          echo matrix.config JSON:
          cat matrix.json
          echo -----
      
      # Workaround for https://github.com/actions/cache/issues/92
      - name: Checkout cache action
        uses: actions/checkout@v3
        with:
          repository: actions/cache
          ref: v3
          path: tmp/actions/cache
          
      - name: Make actions/cache@v3 run even on failure
        run: |
          sed -i'.bak' -e '/ post-if: /d' tmp/actions/cache/action.yml
          
      - name: Restore compiler cache
        uses: ./tmp/actions/cache
        with:
          path: |
            ${{runner.workspace}}/ccache
          key: ${{matrix.os}}-${{hashfiles('matrix.json')}}-${{github.ref}}-${{github.sha}}
          restore-keys: |
            ${{matrix.os}}-${{hashfiles('matrix.json')}}-${{github.ref}}-
            ${{matrix.os}}-${{hashfiles('matrix.json')}}-

      - name: Build and Test
        id: build-test
        shell: bash
        working-directory: ${{runner.workspace}}/CoreNeuron
        run:  |
          cmake_args=(${{matrix.config.cmake_option}})
          if [[ "${{ startsWith(matrix.os, 'macOS') }}" = "true" ]]; then
              cmake_args+=(-DCORENRN_ENABLE_OPENMP=OFF)
          else
              cmake_args+=(-DCORENRN_ENABLE_OPENMP=ON)
          fi

          if [[ "${{matrix.config.flag_warnings}}" == "ON" ]]; then
              cmake_args+=(-DCORENRN_EXTRA_CXX_FLAGS="-Wall")
          fi

          if [[ -n "${{matrix.config.sanitizer}}" ]]; then
              CC=$(command -v clang-14)
              CXX=$(command -v clang++-14)
              symbolizer_path=$(realpath $(command -v llvm-symbolizer-14))
              cmake_args+=(-DCMAKE_BUILD_TYPE=Custom \
                           -DCMAKE_C_FLAGS="-O1 -g -Wno-writable-strings" \
                           -DCMAKE_CXX_FLAGS="-O1 -g -Wno-writable-strings" \
                           -DLLVM_SYMBOLIZER_PATH="${symbolizer_path}" \
                           -DCORENRN_SANITIZERS=$(echo ${{matrix.config.sanitizer}} | sed -e 's/-/,/g'))
          else
              CC=${CC:-gcc}
              CXX=${CXX:-g++}
          fi
          
          echo "------- Build, Test and Install -------"
          mkdir build && cd build
          if [[ "$USE_NMODL" == "ON" ]]; then
              cmake_args+=(-DCORENRN_ENABLE_NMODL=ON "-DCORENRN_NMODL_FLAGS=sympy --analytic")
          fi
          cmake .. -G Ninja "${cmake_args[@]}" \
            -DCMAKE_C_COMPILER="${CC}" \
            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
            -DCMAKE_CXX_COMPILER="${CXX}" \
            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
            "-DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install" \
            -DPYTHON_EXECUTABLE=$(command -v python3)
          if ccache --version | grep -E '^ccache version 4\.(4|4\.1)$'
          then
            echo "------- Disable ccache direct mode -------"
            # https://github.com/ccache/ccache/issues/935
            export CCACHE_NODIRECT=1
          fi
          ccache -z
          # Older versions don't support -v (verbose)
          ccache -vs 2>/dev/null || ccache -s
          cmake --build . --parallel
          ccache -vs 2>/dev/null || ccache -s
          ctest -T Test --output-on-failure
          cmake --build . --target install
        env:
          CCACHE_BASEDIR: ${{runner.workspace}}/CoreNeuron
          CCACHE_DIR: ${{runner.workspace}}/ccache
          USE_NMODL: ${{matrix.config.use_nmodl}}

      - uses: actions/upload-artifact@v3
        with:
          name: ctest-results-${{hashfiles('matrix.json')}}-sanitizer
          path: ${{runner.workspace}}/CoreNeuron/build/Testing/*/Test.xml

      # This step will set up an SSH connection on tmate.io for live debugging.
      # To enable it, you have to:
      #   * add 'live-debug-ci' to your PR title
      #   * push something to your PR branch (note that just re-running the pipeline disregards the title update)
      - name: live debug session on failure (manual steps required, check `.github/workflows/coreneuron-ci.yml`)
        if: failure() && contains(github.event.pull_request.title, 'live-debug-ci')
        uses: mxschmitt/action-tmate@v3

      - name: Documentation
        if: ${{ startsWith(matrix.os, 'ubuntu') && matrix.config.documentation == 'ON' }}
        id: documentation
        working-directory: ${{runner.workspace}}/CoreNeuron/build
        run: |
          echo "------- Build Doxygen Documentation -------";
          cmake --build . --target docs
          echo "-------- Disable jekyll --------";
          pushd docs;
          touch .nojekyll;
          echo ::set-output name=status::done
          
      - name: Deploy 🚀
        uses: JamesIves/github-pages-deploy-action@v4
        if: steps.documentation.outputs.status == 'done' && github.ref == 'refs/heads/master'
        with:
          branch: gh-pages # The branch the action should deploy to.
          folder: ${{runner.workspace}}/CoreNeuron/build/docs  # The folder the action should deploy.
          single-commit: true # Keep a single commit on the deployment branch instead of maintaining the full history.


================================================
FILE: .github/workflows/coverage.yml
================================================
name: Coverage

concurrency:
  group: ${{ github.workflow }}#${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    branches:
      - master
      - release/**
  pull_request:
    branches:
      - master
      - release/**

env:
  CMAKE_BUILD_PARALLEL_LEVEL: 3

jobs:
  coverage:
    runs-on: ubuntu-20.04
    name: "Coverage Test"
    steps:
      - name: Install packages
        run: |
          sudo apt-get update
          sudo apt-get install bison doxygen flex lcov libboost-all-dev \
            libopenmpi-dev libfl-dev ninja-build openmpi-bin python3-dev \
            python3-pip
        shell: bash
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Build and Test for Coverage
        id: build-test
        shell: bash
        working-directory: ${{runner.workspace}}/CoreNeuron
        run:  |
          mkdir build && cd build
          cmake .. -G Ninja \
            -DCMAKE_BUILD_TYPE=Debug \
            -DCMAKE_C_FLAGS="-coverage" \
            -DCMAKE_CXX_FLAGS="-coverage" \
            -DCORENRN_ENABLE_MPI=ON \
            -DCORENRN_ENABLE_DEBUG_CODE=ON
          cmake --build .
          (cd ..;  lcov --capture  --initial --directory . --no-external --output-file build/coverage-base.info)
          ctest --output-on-failure
          (cd ..; lcov --capture  --directory . --no-external --output-file build/coverage-run.info)
          lcov --add-tracefile coverage-base.info --add-tracefile coverage-run.info --output-file coverage-combined.info
          lcov --remove coverage-combined.info --output-file coverage.info "*/external/*"
          lcov --list coverage.info
      - name: Upload to codecov.io
        run: |
          # Download codecov script and perform integrity checks
          curl https://keybase.io/codecovsecurity/pgp_keys.asc | gpg --import # One-time step 
          curl -Os https://uploader.codecov.io/latest/linux/codecov 
          curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM 
          curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig 
          gpg --verify codecov.SHA256SUM.sig codecov.SHA256SUM 
          shasum -a 256 -c codecov.SHA256SUM 
          chmod +x codecov 
          ./codecov -f build/coverage.info


================================================
FILE: .github/workflows/test-as-submodule.yml
================================================
name: NEURON submodule

concurrency:
  group: ${{ github.workflow }}#${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    branches:
      - master
      - release/**
  pull_request:
    branches:
      - master
      - release/**

jobs:
  ci:
    name: ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        include:
          - os: ubuntu-20.04
            cores: 2
          - os: macOS-11
            cores: 3
      fail-fast: false
    env:
      CMAKE_BUILD_PARALLEL_LEVEL: ${{matrix.cores}}
      SDK_ROOT: $(xcrun --sdk macosx --show-sdk-path)

    steps:

      - name: Install homebrew packages
        if: startsWith(matrix.os, 'macOS')
        run: |
          brew install bison coreutils flex ninja openmpi
          python3 -m pip install --upgrade numpy pytest pytest-cov
          echo /usr/local/opt/flex/bin:/usr/local/opt/bison/bin >> $GITHUB_PATH
          echo "CC=gcc" >> $GITHUB_ENV
          echo "CXX=g++" >> $GITHUB_ENV

      - name: Install apt packages
        if: startsWith(matrix.os, 'ubuntu')
        run: |
          sudo apt-get update
          sudo apt-get install bison cython3 flex libfl-dev libopenmpi-dev \
            ninja-build openmpi-bin python3-dev
          python3 -m pip install --upgrade numpy pytest pytest-cov
          echo "CC=gcc" >> $GITHUB_ENV
          echo "CXX=g++" >> $GITHUB_ENV

      - name: Set NEURON branch
        id: vars
        env:
          GITHUB_PR_BODY: ${{ github.event.pull_request.body }}
        run: |
          nrn_branch=$(echo "${GITHUB_PR_BODY}" | grep "^CI_BRANCHES" \
                      | awk -F '[:,]{1}NEURON_BRANCH=' '{print $2}' \
                      | awk -F ',' '{print $1}')
          if [ -z "$nrn_branch" ]; then
              nrn_branch=master
          fi
          echo "Will use neuron branch: $nrn_branch"
          echo ::set-output name=neuron_branch::"${nrn_branch}"

      - uses: actions/checkout@v3
        name: Checkout NEURON
        with:
          path: nrn
          repository: neuronsimulator/nrn
          ref: ${{ steps.vars.outputs.neuron_branch }}

      - name: Update CoreNEURON submodule
        run: |
          cd ${GITHUB_WORKSPACE}/nrn
          coreneuron_sha=${{github.event.pull_request.head.sha}}
          if [[ -z ${coreneuron_sha} ]]; then
          # presumably we're running on a push event
          coreneuron_sha=${{github.sha}}
          fi
          echo "Using CoreNEURON SHA ${coreneuron_sha}"
          # https://stackoverflow.com/a/33575837
          git update-index --cacheinfo 160000,${coreneuron_sha},external/coreneuron
          git submodule update --init external/coreneuron
          echo "NEURON status"
          git status
          git log -n 1
          cd external/coreneuron
          echo "CoreNEURON status"
          git status
          git log -n 1

      - name: Configure NEURON
        run: |
          cd ${GITHUB_WORKSPACE}/nrn
          mkdir build install
          cd build
          # NEURON CMake assumes this is defined.
          export SHELL=$(command -v bash)
          openMP=" -DCORENRN_ENABLE_OPENMP=ON"
          if [[ "${{ startsWith(matrix.os, 'macOS') }}" = "true" ]]; then
            openMP=" -DCORENRN_ENABLE_OPENMP=OFF"
          fi
          cmake .. -G Ninja \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DCMAKE_INSTALL_PREFIX=../install \
            -DPYTHON_EXECUTABLE=$(command -v python3) \
            -DNRN_ENABLE_CORENEURON=ON \
            -DNRN_ENABLE_INTERVIEWS=OFF \
            -DNRN_ENABLE_RX3D=OFF \
            -DNRN_ENABLE_MPI_DYNAMIC=ON \
            -DNRN_ENABLE_TESTS=ON ${openMP}

      - name: Build NEURON
        run: |
          cd ${GITHUB_WORKSPACE}/nrn/build
          cmake --build . --parallel

      - name: Test NEURON
        run: |
          cd ${GITHUB_WORKSPACE}/nrn/build
          ctest --output-on-failure

      - name: Install NEURON
        run: |
          cd ${GITHUB_WORKSPACE}/nrn/build
          cmake --build . --target install

      # This step will set up an SSH connection on tmate.io for live debugging.
      # To enable it, you have to:
      #   * add 'live-debug-ci' to your PR title
      #   * push something to your PR branch (note that just re-running the pipeline disregards the title update)
      - name: live debug session on failure (manual steps required, check `.github/workflows/test-as-submodule.yml`)
        if: failure() && contains(github.event.pull_request.title, 'live-debug-ci')
        uses: mxschmitt/action-tmate@v3


================================================
FILE: .gitignore
================================================
cmake-build-debug*
*build*
spconfig.*
*~
.DS_Store
*.swp
*.srctrl*

# HPC coding conventions
.clang-format
.clang-tidy
.cmake-format.yaml
.pre-commit-config.yaml
.bbp-project-venv/


================================================
FILE: .gitlab-ci.yml
================================================
include:
  - project: hpc/gitlab-pipelines
    file:
      - spack-build-components.gitlab-ci.yml
      - github-project-pipelines.gitlab-ci.yml
    ref: '$GITLAB_PIPELINES_BRANCH'
  - project: hpc/gitlab-upload-logs
    file: enable-upload.yml

variables:
  NEURON_BRANCH:
    description: Branch of NEURON to build against CoreNEURON (NEURON_COMMIT and NEURON_TAG also possible)
    value: master
  NMODL_BRANCH:
    description: Branch of NMODL to build CoreNEURON against (NMODL_COMMIT and NMODL_TAG also possible)
    value: master
  SPACK_BRANCH:
    description: Branch of BlueBrain Spack to use for the CI pipeline
    value: develop
  SPACK_DEPLOYMENT_SUFFIX:
    description: Extra path component used when finding deployed software. Set to something like `pulls/1497` use software built for https://github.com/BlueBrain/spack/pull/1497. You probably want to set SPACK_BRANCH to the branch used in the relevant PR if you set this.
    value: ''

# Set up Spack
spack_setup:
  extends: .spack_setup_ccache
  variables:
    CORENEURON_COMMIT: ${CI_COMMIT_SHA}
    # Enable fetching GitHub PR descriptions and parsing them to find out what
    # branches to build of other projects.
    PARSE_GITHUB_PR_DESCRIPTIONS: "true"

simulation_stack:
  stage: .pre
  # Take advantage of GitHub PR description parsing in the spack_setup job.
  needs: [spack_setup]
  trigger:
    project: hpc/sim/blueconfigs
    # CoreNEURON CI status depends on the BlueConfigs CI status.
    strategy: depend
  variables:
    GITLAB_PIPELINES_BRANCH: $GITLAB_PIPELINES_BRANCH
    SPACK_ENV_FILE_URL: $SPACK_SETUP_COMMIT_MAPPING_URL

# Performance seems to be terrible when we get too many jobs on a single node.
.build:
  extends: [.spack_build]
  variables:
    bb5_ntasks: 2   # so we block 16 cores
    bb5_cpus_per_task: 8 # ninja -j {this}
    bb5_memory: 76G # ~16*384/80

.spack_intel:
  variables:
    SPACK_PACKAGE_COMPILER: intel
.spack_nvhpc:
  variables:
    SPACK_PACKAGE_COMPILER: nvhpc
.build_neuron:
  extends: [.build]
  timeout: two hours
  variables:
    bb5_duration: "2:00:00"
    SPACK_PACKAGE: neuron
    SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit~rx3d model_tests=channel-benchmark,olfactory,tqperf-heavy
.gpu_node:
  variables:
    bb5_constraint: volta
    bb5_cpus_per_task: 2
.test_neuron:
  extends: [.ctest]
  variables:
    bb5_ntasks: 16
    bb5_memory: 76G # ~16*384/80

# Build NMODL once with GCC
build:nmodl:
  extends: [.build]
  variables:
    SPACK_PACKAGE: nmodl
    SPACK_PACKAGE_SPEC: ~legacy-unit
    SPACK_PACKAGE_COMPILER: gcc

# Build CoreNEURON
.build_coreneuron:
  extends: [.build]
  variables:
    SPACK_PACKAGE: coreneuron
    # NEURON depends on py-mpi4py, most of whose dependencies are pulled in by
    # nmodl%gcc, with the exception of MPI, which is pulled in by
    # coreneuron%{nvhpc,intel}. hpe-mpi is an external package anyway, so
    # setting its compiler is just changing how it is labelled in the
    # dependency graph and not changing which installation is used, but this
    # means that in the NEURON step an existing py-mpi4py%gcc can be used.
    # Otherwise a new py-mpi4py with hpe-mpi%{nvhpc,intel} will be built.
    # caliper: papi%nvhpc does not build; use the caliper from the deployment
    # TODO: fix this more robustly so we don't have to play so many games.
    SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc ^caliper%gcc+cuda cuda_arch=70

# TODO: improve coverage by switching an Intel build to be statically linked
# TODO: improve coverage by switching an Intel build to RelWithDebInfo
# TODO: improve coverage by enabling +openmp on an Intel build
build:coreneuron:mod2c:intel:shared:debug:
  extends: [.build_coreneuron, .spack_intel]
  variables:
    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=Debug

build:coreneuron:nmodl:intel:debug:legacy:
  extends: [.build_coreneuron, .spack_intel]
  needs: ["build:nmodl"]
  variables:
    SPACK_PACKAGE_SPEC: +caliper~gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug

# Disable caliper to improve coverage
build:coreneuron:nmodl:intel:shared:debug:
  extends: [.build_coreneuron, .spack_intel]
  needs: ["build:nmodl"]
  variables:
    SPACK_PACKAGE_DEPENDENCIES: ^hpe-mpi%gcc
    SPACK_PACKAGE_SPEC: ~caliper~gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=Debug

# Not linked to a NEURON build+test job, see
# https://github.com/BlueBrain/CoreNeuron/issues/594
build:coreneuron:mod2c:nvhpc:acc:debug:unified:
  extends: [.build_coreneuron, .spack_nvhpc]
  variables:
    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl+openmp~shared+tests+unified build_type=Debug

# Shared + OpenACC + OpenMP host threading has problems
build:coreneuron:mod2c:nvhpc:acc:shared:
  extends: [.build_coreneuron, .spack_nvhpc]
  variables:
    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit~nmodl~openmp+shared+tests~unified build_type=RelWithDebInfo

build:coreneuron:nmodl:nvhpc:acc:debug:legacy:
  extends: [.build_coreneuron, .spack_nvhpc]
  needs: ["build:nmodl"]
  variables:
    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp~shared~sympy+tests~unified build_type=Debug

build:coreneuron:nmodl:nvhpc:acc:shared:
  extends: [.build_coreneuron, .spack_nvhpc]
  needs: ["build:nmodl"]
  variables:
    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl~openmp+shared+sympy+tests~unified build_type=RelWithDebInfo

build:coreneuron:nmodl:nvhpc:omp:legacy:
  extends: [.build_coreneuron, .spack_nvhpc]
  needs: ["build:nmodl"]
  variables:
    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared~sympy+tests~unified build_type=RelWithDebInfo

build:coreneuron:nmodl:nvhpc:omp:debug:
  extends: [.build_coreneuron, .spack_nvhpc]
  needs: ["build:nmodl"]
  variables:
    SPACK_PACKAGE_SPEC: +caliper+gpu~legacy-unit+nmodl+openmp~shared+sympy+tests~unified build_type=Debug

# Build NEURON
build:neuron:mod2c:intel:shared:debug:
  extends: [.build_neuron, .spack_intel]
  needs: ["build:coreneuron:mod2c:intel:shared:debug"]

build:neuron:nmodl:intel:debug:legacy:
  extends: [.build_neuron, .spack_intel]
  needs: ["build:coreneuron:nmodl:intel:debug:legacy"]

build:neuron:nmodl:intel:shared:debug:
  extends: [.build_neuron, .spack_intel]
  needs: ["build:coreneuron:nmodl:intel:shared:debug"]

build:neuron:mod2c:nvhpc:acc:shared:
  extends: [.build_neuron, .spack_nvhpc]
  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]

build:neuron:nmodl:nvhpc:acc:debug:legacy:
  extends: [.build_neuron, .spack_nvhpc]
  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"]

build:neuron:nmodl:nvhpc:acc:shared:
  extends: [.build_neuron, .spack_nvhpc]
  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"]

build:neuron:nmodl:nvhpc:omp:legacy:
  extends: [.build_neuron, .spack_nvhpc]
  needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"]

build:neuron:nmodl:nvhpc:omp:debug:
  extends: [.build_neuron, .spack_nvhpc]
  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]

# Test CoreNEURON
test:coreneuron:mod2c:intel:shared:debug:
  extends: [.ctest]
  needs: ["build:coreneuron:mod2c:intel:shared:debug"]

test:coreneuron:nmodl:intel:debug:legacy:
  extends: [.ctest]
  needs: ["build:coreneuron:nmodl:intel:debug:legacy"]

test:coreneuron:nmodl:intel:shared:debug:
  extends: [.ctest]
  needs: ["build:coreneuron:nmodl:intel:shared:debug"]

test:coreneuron:mod2c:nvhpc:acc:debug:unified:
  extends: [.ctest, .gpu_node]
  needs: ["build:coreneuron:mod2c:nvhpc:acc:debug:unified"]

test:coreneuron:mod2c:nvhpc:acc:shared:
  extends: [.ctest, .gpu_node]
  needs: ["build:coreneuron:mod2c:nvhpc:acc:shared"]

test:coreneuron:nmodl:nvhpc:acc:debug:legacy:
  extends: [.ctest, .gpu_node]
  needs: ["build:coreneuron:nmodl:nvhpc:acc:debug:legacy"]

test:coreneuron:nmodl:nvhpc:acc:shared:
  extends: [.ctest, .gpu_node]
  needs: ["build:coreneuron:nmodl:nvhpc:acc:shared"]

test:coreneuron:nmodl:nvhpc:omp:legacy:
  extends: [.ctest, .gpu_node]
  needs: ["build:coreneuron:nmodl:nvhpc:omp:legacy"]

test:coreneuron:nmodl:nvhpc:omp:debug:
  extends: [.ctest, .gpu_node]
  needs: ["build:coreneuron:nmodl:nvhpc:omp:debug"]

# Test NEURON
test:neuron:mod2c:intel:shared:debug:
  extends: [.test_neuron]
  needs: ["build:neuron:mod2c:intel:shared:debug"]

test:neuron:nmodl:intel:debug:legacy:
  extends: [.test_neuron]
  needs: ["build:neuron:nmodl:intel:debug:legacy"]

test:neuron:nmodl:intel:shared:debug:
  extends: [.test_neuron]
  needs: ["build:neuron:nmodl:intel:shared:debug"]

test:neuron:mod2c:nvhpc:acc:shared:
  extends: [.test_neuron, .gpu_node]
  needs: ["build:neuron:mod2c:nvhpc:acc:shared"]

test:neuron:nmodl:nvhpc:acc:debug:legacy:
  extends: [.test_neuron, .gpu_node]
  needs: ["build:neuron:nmodl:nvhpc:acc:debug:legacy"]

test:neuron:nmodl:nvhpc:acc:shared:
  extends: [.test_neuron, .gpu_node]
  needs: ["build:neuron:nmodl:nvhpc:acc:shared"]

test:neuron:nmodl:nvhpc:omp:legacy:
  extends: [.test_neuron, .gpu_node]
  needs: ["build:neuron:nmodl:nvhpc:omp:legacy"]

test:neuron:nmodl:nvhpc:omp:debug:
  extends: [.test_neuron, .gpu_node]
  needs: ["build:neuron:nmodl:nvhpc:omp:debug"]


================================================
FILE: .gitmodules
================================================
[submodule "external/mod2c"]
  path = external/mod2c
  url = https://github.com/BlueBrain/mod2c
[submodule "external/CLI11"]
  path = external/CLI11
  url = https://github.com/CLIUtils/CLI11.git
[submodule "external/nmodl"]
  path = external/nmodl
  url = https://github.com/BlueBrain/nmodl
[submodule "external/Random123"]
	path = external/Random123
	url = https://github.com/BlueBrain/Random123.git
[submodule "CMake/hpc-coding-conventions"]
	path = CMake/hpc-coding-conventions
	url = https://github.com/BlueBrain/hpc-coding-conventions.git


================================================
FILE: .readthedocs.yml
================================================
version: 2

conda:
  environment: docs/conda_environment.yml

python:
  install:
    - requirements: docs/docs_requirements.txt


================================================
FILE: .sanitizers/undefined.supp
================================================
unsigned-integer-overflow:_philox4x32bumpkey(r123array2x32)
unsigned-integer-overflow:coreneuron::TNode::mkhash()
unsigned-integer-overflow:std::mersenne_twister_engine


================================================
FILE: AUTHORS.txt
================================================
Akiko Sato
Aleksandr Ovcharenko
Alessandro Cattabiani
Alexander Dietz
Alexandru Săvulescu
Antonio Bellotta
Baudouin Del Marmol
Bruno Magalhaes
Christos Kotsalos
Fabien Delalondre
Felix Schuermann (contributor)
Fernando Pereira
Francesco Cremonesi
Ioannis Magkanaris
James Gonzalo King
Jeremy Fouriaux
Jorge Blanco Alonso
Kai Langen
Michael Lee Hines
Nicolas Cornu
Olli Lupton
Omar Awile
Oren Amsalem
Pramod Shivaji Kumbhar (maintainer)
Sam Yates
Sergio Rivas-Gomez
Tapasweni Pathak
Weina Ji
viniciusdepadua


================================================
FILE: CMake/AddHpcCodingConvSubmodule.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Make sure the CMake/hpc-coding-conventions git submodule is checked out, fetching it with git if
# it is missing.
include(FindPackageHandleStandardArgs)
# NOTE(review): this asks for a package literally named "FindPkgConfig" and nothing in this file
# uses the result -- confirm whether it is still needed.
find_package(FindPkgConfig QUIET)

# A populated submodule contains setup.cfg; HpcCodingConv_PROJ holds the directory it was found in.
find_path(
  HpcCodingConv_PROJ
  NAMES setup.cfg
  PATHS "${CORENEURON_PROJECT_SOURCE_DIR}/CMake/hpc-coding-conventions/")

# Sets HpcCodingConv_FOUND according to whether HpcCodingConv_PROJ was located.
find_package_handle_standard_args(HpcCodingConv REQUIRED_VARS HpcCodingConv_PROJ)

if(NOT HpcCodingConv_FOUND)
  # The submodule is not populated: git is required to fetch it.
  find_package(Git 1.8.3 QUIET)
  if(NOT ${GIT_FOUND})
    message(FATAL_ERROR "git not found, clone repository with --recursive")
  endif()
  message(
    STATUS "Sub-module CMake/hpc-coding-conventions missing: running git submodule update --init")
  # The exit status of git is not inspected here, so a failed update does not stop configuration
  # at this point.
  execute_process(
    COMMAND ${GIT_EXECUTABLE} submodule update --init --
            ${CORENEURON_PROJECT_SOURCE_DIR}/CMake/hpc-coding-conventions
    WORKING_DIRECTORY ${CORENEURON_PROJECT_SOURCE_DIR})
endif()


================================================
FILE: CMake/AddMod2cSubmodule.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Make sure the external/mod2c git submodule is checked out (fetching it with git if it is
# missing) and then add it to the build.

# find_package_handle_standard_args() is defined by this module. Include it explicitly so that this
# file does not rely on another script having loaded it earlier.
include(FindPackageHandleStandardArgs)
find_package(FindPkgConfig QUIET)

# A populated submodule contains a CMakeLists.txt; MOD2C_PROJ holds the directory it was found in.
find_path(
  MOD2C_PROJ
  NAMES CMakeLists.txt
  PATHS "${CORENEURON_PROJECT_SOURCE_DIR}/external/mod2c")

# Sets MOD2C_FOUND according to whether MOD2C_PROJ was located.
find_package_handle_standard_args(MOD2C REQUIRED_VARS MOD2C_PROJ)

if(NOT MOD2C_FOUND)
  # The submodule is not populated: git is required to fetch it.
  find_package(Git 1.8.3 QUIET)
  if(NOT ${GIT_FOUND})
    message(FATAL_ERROR "git not found, clone repository with --recursive")
  endif()
  message(STATUS "Sub-module mod2c missing : running git submodule update --init --recursive")
  execute_process(
    COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive --
            ${CORENEURON_PROJECT_SOURCE_DIR}/external/mod2c
    WORKING_DIRECTORY ${CORENEURON_PROJECT_SOURCE_DIR})
else()
  message(STATUS "Using mod2c submodule from ${MOD2C_PROJ}")
endif()

add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/mod2c)


================================================
FILE: CMake/AddNmodlSubmodule.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Make sure the external/nmodl git submodule is checked out (fetching it with git if it is
# missing) and then add it to the build.

# find_package_handle_standard_args() is defined by this module. Include it explicitly so that this
# file does not rely on another script having loaded it earlier.
include(FindPackageHandleStandardArgs)
find_package(FindPkgConfig QUIET)

# A populated submodule contains a CMakeLists.txt; NMODL_PROJ holds the directory it was found in.
find_path(
  NMODL_PROJ
  NAMES CMakeLists.txt
  PATHS "${CORENEURON_PROJECT_SOURCE_DIR}/external/nmodl")

# Sets NMODL_FOUND according to whether NMODL_PROJ was located.
find_package_handle_standard_args(NMODL REQUIRED_VARS NMODL_PROJ)

if(NOT NMODL_FOUND)
  # The submodule is not populated: git is required to fetch it.
  find_package(Git 1.8.3 QUIET)
  if(NOT ${GIT_FOUND})
    message(FATAL_ERROR "git not found, clone repository with --recursive")
  endif()
  message(STATUS "Sub-module nmodl missing : running git submodule update --init")
  execute_process(
    COMMAND ${GIT_EXECUTABLE} submodule update --init --
            ${CORENEURON_PROJECT_SOURCE_DIR}/external/nmodl
    WORKING_DIRECTORY ${CORENEURON_PROJECT_SOURCE_DIR})
else()
  message(STATUS "Using nmodl submodule from ${NMODL_PROJ}")
endif()

add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/nmodl)


================================================
FILE: CMake/AddRandom123Submodule.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Make sure the external/Random123 git submodule is checked out, fetching it with git if it is
# missing.
include(FindPackageHandleStandardArgs)
# NOTE(review): this asks for a package literally named "FindPkgConfig" and nothing in this file
# uses the result -- confirm whether it is still needed.
find_package(FindPkgConfig QUIET)

# A populated submodule contains a LICENSE file. The NO_* options switch off the corresponding
# CMake/system search locations for this lookup.
find_path(
  Random123_PROJ
  NAMES LICENSE
  PATHS "${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123"
  NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_SYSTEM_ENVIRONMENT_PATH NO_CMAKE_SYSTEM_PATH)

# Sets Random123_FOUND according to whether Random123_PROJ was located.
find_package_handle_standard_args(Random123 REQUIRED_VARS Random123_PROJ)

if(NOT Random123_FOUND)
  # The submodule is not populated: git is required to fetch it.
  find_package(Git 1.8.3 QUIET)
  if(NOT ${GIT_FOUND})
    message(FATAL_ERROR "git not found, clone repository with --recursive")
  endif()
  message(STATUS "Sub-module Random123 missing: running git submodule update --init --recursive")
  # The exit status of git is not inspected here, so a failed update does not stop configuration
  # at this point.
  execute_process(
    COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive --
            ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123
    WORKING_DIRECTORY ${CORENEURON_PROJECT_SOURCE_DIR})
endif()


================================================
FILE: CMake/CrayPortability.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Choose the default library type (COMPILE_LIBRARY_TYPE) and adjust MPI/link settings depending on
# whether we are configuring on a Cray system.

# The presence of /opt/cray marks a Cray system. CRAY_SYSTEM is only ever set to TRUE here, so a
# value provided from outside this file is left untouched.
if(IS_DIRECTORY "/opt/cray")
  set(CRAY_SYSTEM TRUE)
endif()

if(CRAY_SYSTEM)
  # default build type is static for cray
  if(NOT DEFINED COMPILE_LIBRARY_TYPE)
    set(COMPILE_LIBRARY_TYPE "STATIC")
  endif()

  # The Cray wrapper takes care of everything, so the MPI library lists are emptied.
  set(MPI_LIBRARIES "")
  set(MPI_C_LIBRARIES "")
  set(MPI_CXX_LIBRARIES "")

  # ~~~
  # instead of -rdynamic, cray wrapper needs either -dynamic or -static(default)
  # also cray compiler needs fPIC flag
  # ~~~
  if(COMPILE_LIBRARY_TYPE STREQUAL "SHARED")
    set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "-dynamic")
    # TODO: add Cray compiler flag configurations in CompilerFlagsHelpers.cmake
    # NOTE(review): CMAKE_C_COMPILER_IS_CRAY is not set in this file; presumably defined by another
    # helper -- confirm.
    if(CMAKE_C_COMPILER_IS_CRAY)
      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
    endif()

  else()
    # Static build: do not pass any extra flag when linking against shared libraries.
    set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
  endif()
else()
  # default is shared library
  if(NOT DEFINED COMPILE_LIBRARY_TYPE)
    set(COMPILE_LIBRARY_TYPE "SHARED")
  endif()
endif()


================================================
FILE: CMake/GitRevision.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# For now use a simple approach to get version information, as git is often
# available on the machine where we are building from source
# ~~~

# Compute CN_GIT_REVISION as "<abbreviated sha1> (<commit date>)" of the last commit, or "unknown"
# when that information cannot be obtained.
find_package(Git)

if(GIT_FOUND)
  # get last commit sha1
  execute_process(
    COMMAND ${GIT_EXECUTABLE} -c log.showSignature=false log -1 --format=%h
    WORKING_DIRECTORY ${CORENEURON_PROJECT_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_REVISION_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
  # get last commit date
  execute_process(
    COMMAND ${GIT_EXECUTABLE} -c log.showSignature=false show -s --format=%ci
    WORKING_DIRECTORY ${CORENEURON_PROJECT_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_REVISION_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()

# git may be installed while the sources are not a git checkout (e.g. an unpacked tarball). The
# commands above then print nothing (errors are silenced), so fall back to "unknown" instead of
# producing " ()". The comparison is done on the string itself because if(<variable>) would treat
# a hash that happens to look like the number zero as false.
if(GIT_FOUND AND NOT "${GIT_REVISION_SHA1}" STREQUAL "")
  set(CN_GIT_REVISION "${GIT_REVISION_SHA1} (${GIT_REVISION_DATE})")
else()
  set(CN_GIT_REVISION "unknown")
endif()


================================================
FILE: CMake/MakefileBuildOptions.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2022 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# =============================================================================
# NMODL CLI options : common and backend specific
# =============================================================================
# ~~~
# if the user passes extra arguments (CORENRN_NMODL_FLAGS), append them to the common arguments
# note that inlining is done by default
# ~~~
# Arguments that are always passed: run the inlining pass.
set(NMODL_COMMON_ARGS "passes --inline")

# Append the user-provided flags, separated by a space, when any were given.
if(NOT "${CORENRN_NMODL_FLAGS}" STREQUAL "")
  string(APPEND NMODL_COMMON_ARGS " ${CORENRN_NMODL_FLAGS}")
endif()

# Backend-specific argument strings: the accelerator variant is the CPU one plus "acc --oacc".
set(NMODL_CPU_BACKEND_ARGS "host --c")
set(NMODL_ACC_BACKEND_ARGS "host --c acc --oacc")

# =============================================================================
# Construct the linker arguments that are used inside nrnivmodl-core (to build libcorenrnmech from
# libcoreneuron-core, libcoreneuron-cuda and mechanism object files) and inside nrnivmodl (to link
# NEURON's special against CoreNEURON's libcorenrnmech). These are stored in two global properties:
# CORENRN_LIB_LINK_FLAGS (used by NEURON/nrnivmodl to link special against CoreNEURON) and
# CORENRN_LIB_LINK_DEP_FLAGS (used by CoreNEURON/nrnivmodl-core to link libcorenrnmech.so).
# Conceptually: CORENRN_LIB_LINK_FLAGS = -lcorenrnmech $CORENRN_LIB_LINK_DEP_FLAGS
# =============================================================================
# In non-shared builds wrap -lcorenrnmech in --whole-archive/--no-whole-archive; in shared builds
# only -lcorenrnmech is appended. All three pieces are appended to the CORENRN_LIB_LINK_FLAGS
# global property, so their order in this file is the order on the link line.
if(NOT CORENRN_ENABLE_SHARED)
  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--whole-archive")
endif()
set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -lcorenrnmech")
if(NOT CORENRN_ENABLE_SHARED)
  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -Wl,--no-whole-archive")
endif()
# Essentially we "just" want to unpack the CMake dependencies of the `coreneuron-core` target into a
# plain string that we can bake into the Makefiles in both NEURON and CoreNEURON.
# Translate one library reference into linker arguments and append them to the
# CORENRN_LIB_LINK_DEP_FLAGS global property.
#
# * library: a bare library name (e.g. "dl") or a path to a library file.
#
# Three cases are distinguished by the directory part of `library`:
#
# * no directory                          -> " -l<library>"
# * /lib, /lib64, /usr/lib or /usr/lib64  -> " -l<name>", with the "lib" prefix and extension removed
# * any other directory                   -> " -Wl,-rpath,<directory> <library>"
function(coreneuron_process_library_path library)
  get_filename_component(parent_dir "${library}" DIRECTORY)
  if(NOT parent_dir)
    # Not a path at all, just the name of a library, e.g. "dl"
    set(link_flag " -l${library}")
  elseif("${parent_dir}" MATCHES "^(/lib|/lib64|/usr/lib|/usr/lib64)$")
    # e.g. /usr/lib64/libpthread.so -> -lpthread TODO: consider using
    # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_IMPLICIT_LINK_DIRECTORIES.html, or
    # dropping this special case entirely
    get_filename_component(stem ${library} NAME_WE)
    string(REGEX REPLACE "^lib" "" stem ${stem})
    set(link_flag " -l${stem}")
  else()
    # A full path outside the system directories: link it directly and add an RPATH for its folder
    set(link_flag " -Wl,-rpath,${parent_dir} ${library}")
  endif()
  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS "${link_flag}")
endfunction()
# Recursively unpack a link dependency into plain compiler/linker flags.
#
# * target: either the name of a CMake target or a plain library name/path.
#
# For a CMake target (other than coreneuron-core itself) this appends
#
# * " -I<dir>" for each of its INTERFACE_INCLUDE_DIRECTORIES (generator expressions stripped) to the
#   CORENRN_EXTRA_COMPILE_FLAGS global property, and
# * the link arguments for the target to the CORENRN_LIB_LINK_DEP_FLAGS global property,
#
# and then processes every entry of the target's LINK_LIBRARIES in the same way. Anything that is
# not a CMake target is passed directly to coreneuron_process_library_path.
function(coreneuron_process_target target)
  if(TARGET ${target})
    if(NOT target STREQUAL "coreneuron-core")
      # This is a special case: libcoreneuron-core.a is manually unpacked into .o files by the
      # nrnivmodl-core Makefile, so we do not want to also emit an -lcoreneuron-core argument.
      get_target_property(target_inc_dirs ${target} INTERFACE_INCLUDE_DIRECTORIES)
      if(target_inc_dirs)
        foreach(inc_dir_genex ${target_inc_dirs})
          string(GENEX_STRIP "${inc_dir_genex}" inc_dir)
          if(inc_dir)
            set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_EXTRA_COMPILE_FLAGS " -I${inc_dir}")
          endif()
        endforeach()
      endif()
      get_target_property(target_imported ${target} IMPORTED)
      if(target_imported)
        # In this case we can usually extract the full path to the library
        get_target_property(target_location ${target} LOCATION)
        # get_target_property() yields "<var>-NOTFOUND" when the property is not set (e.g. an
        # imported target that has no library file). Skip the link flag in that case instead of
        # emitting a bogus "-ltarget_location-NOTFOUND".
        if(target_location)
          coreneuron_process_library_path("${target_location}")
        endif()
      else()
        # This is probably another of our libraries, like -lcoreneuron-cuda. We might need to add -L
        # and an RPATH later.
        set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_DEP_FLAGS " -l${target}")
      endif()
    endif()
    # Descend into the libraries this target links against.
    get_target_property(target_libraries ${target} LINK_LIBRARIES)
    if(target_libraries)
      foreach(child_target ${target_libraries})
        coreneuron_process_target(${child_target})
      endforeach()
    endif()
    return()
  endif()
  # Not a CMake target: treat it as a library name or path.
  coreneuron_process_library_path("${target}")
endfunction()
# Walk the dependencies of coreneuron-core; this fills the CORENRN_LIB_LINK_DEP_FLAGS and
# CORENRN_EXTRA_COMPILE_FLAGS global properties.
coreneuron_process_target(coreneuron-core)
# Copy the dependency flags into a variable and append them to CORENRN_LIB_LINK_FLAGS.
get_property(CORENRN_LIB_LINK_DEP_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_DEP_FLAGS)
set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " ${CORENRN_LIB_LINK_DEP_FLAGS}")
# In static builds NEURON uses dlopen(nullptr, ...) to look for the corenrn_embedded_run symbol,
# which comes from libcoreneuron-core.a and gets included in libcorenrnmech.
if(NOT CORENRN_ENABLE_SHARED)
  set_property(GLOBAL APPEND_STRING PROPERTY CORENRN_LIB_LINK_FLAGS " -rdynamic")
endif()
# Expose the final values of both global properties as variables of the same name.
get_property(CORENRN_EXTRA_COMPILE_FLAGS GLOBAL PROPERTY CORENRN_EXTRA_COMPILE_FLAGS)
get_property(CORENRN_LIB_LINK_FLAGS GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS)

# Detect if --start-group and --end-group are valid linker arguments. These are typically needed
# when linking mutually-dependent .o files (or where we don't know the correct order) on Linux, but
# they are not needed *or* recognised by the macOS linker.
# With CMake >= 3.18 ask the linker directly; with older CMake fall back to a platform guess. On
# older CMake and a non-Linux system the variable is not set here, so the group flags below stay
# undefined.
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
  include(CheckLinkerFlag)
  check_linker_flag(CXX -Wl,--start-group CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
elseif(CMAKE_SYSTEM_NAME MATCHES Linux)
  # Assume that --start-group and --end-group are only supported on Linux
  set(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP ON)
endif()
# Only define the pair of group flags when they are supported.
if(CORENRN_CXX_LINKER_SUPPORTS_START_GROUP)
  set(CORENEURON_LINKER_START_GROUP -Wl,--start-group)
  set(CORENEURON_LINKER_END_GROUP -Wl,--end-group)
endif()

# Things that used to be in CORENRN_LIB_LINK_FLAGS: -lrt -L${CMAKE_HOST_SYSTEM_PROCESSOR}
# -L${caliper_LIB_DIR} -l${CALIPER_LIB}

# =============================================================================
# Turn CORENRN_COMPILE_DEFS into a list of -DFOO[=BAR] options.
# =============================================================================
# CORENRN_COMPILE_DEFS itself is left unchanged; the prefixed copy goes to CORENRN_COMPILE_DEF_FLAGS.
list(TRANSFORM CORENRN_COMPILE_DEFS PREPEND -D OUTPUT_VARIABLE CORENRN_COMPILE_DEF_FLAGS)

# =============================================================================
# Extra link flags that we need to include when linking libcorenrnmech.{a,so} in CoreNEURON but that
# do not need to be passed to NEURON to use when linking nrniv/special (why?)
# =============================================================================
# Space-separated concatenation of the dependency flags and any extra link flags.
string(JOIN " " CORENRN_COMMON_LDFLAGS ${CORENRN_LIB_LINK_DEP_FLAGS} ${CORENRN_EXTRA_LINK_FLAGS})
# Add an RPATH entry for the sanitizer runtime directory when one is configured.
if(CORENRN_SANITIZER_LIBRARY_DIR)
  string(APPEND CORENRN_COMMON_LDFLAGS " -Wl,-rpath,${CORENRN_SANITIZER_LIBRARY_DIR}")
endif()
# Flatten the list of sanitizer environment settings into one space-separated string.
string(JOIN " " CORENRN_SANITIZER_ENABLE_ENVIRONMENT_STRING ${CORENRN_SANITIZER_ENABLE_ENVIRONMENT})

# =============================================================================
# compile flags : common to all backend
# =============================================================================
# The upper-cased build type selects the matching CMAKE_CXX_FLAGS_<CONFIG> variable below.
string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE)
# Empty or undefined variables contribute nothing to the joined string.
string(
  JOIN
  " "
  CORENRN_CXX_FLAGS
  ${CMAKE_CXX_FLAGS}
  ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}}
  ${CMAKE_CXX17_STANDARD_COMPILE_OPTION}
  ${NVHPC_ACC_COMP_FLAGS}
  ${NVHPC_CXX_INLINE_FLAGS}
  ${CORENRN_COMPILE_DEF_FLAGS}
  ${CORENRN_EXTRA_MECH_CXX_FLAGS}
  ${CORENRN_EXTRA_COMPILE_FLAGS})

# =============================================================================
# nmodl/mod2c related options : TODO
# =============================================================================
# name of nmodl/mod2c binary (file name of CORENRN_MOD2CPP_BINARY without its directory)
get_filename_component(nmodl_name ${CORENRN_MOD2CPP_BINARY} NAME)
set(nmodl_binary_name ${nmodl_name})


================================================
FILE: CMake/OpenAccHelper.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Helper to parse X.Y[.{anything}] into X.Y
#
# Usage: cnrn_parse_version(<full version> OUTPUT_MAJOR_MINOR <output variable>)
#
# * FULL_VERSION: version string with at least a major and a minor component, e.g. "22.3.0".
# * OUTPUT_MAJOR_MINOR: name of the variable, set in the caller's scope, that receives "X.Y".
#
# Configuration is aborted with FATAL_ERROR if unexpected arguments are given, if no output
# variable is named, or if FULL_VERSION has no "X.Y" prefix.
function(cnrn_parse_version FULL_VERSION)
  cmake_parse_arguments(PARSE_ARGV 1 CNRN_PARSE_VERSION "" "OUTPUT_MAJOR_MINOR" "")
  if(NOT "${CNRN_PARSE_VERSION_UNPARSED_ARGUMENTS}" STREQUAL "")
    message(
      FATAL_ERROR
        "cnrn_parse_version got unexpected arguments: ${CNRN_PARSE_VERSION_UNPARSED_ARGUMENTS}")
  endif()
  if("${CNRN_PARSE_VERSION_OUTPUT_MAJOR_MINOR}" STREQUAL "")
    message(FATAL_ERROR "cnrn_parse_version requires OUTPUT_MAJOR_MINOR <variable>")
  endif()
  # Locate the dot that separates major from minor.
  string(FIND "${FULL_VERSION}" . first_dot)
  if(first_dot EQUAL -1)
    message(FATAL_ERROR "Failed to parse major.minor from ${FULL_VERSION}")
  endif()
  math(EXPR first_dot_plus_one "${first_dot}+1")
  string(SUBSTRING "${FULL_VERSION}" ${first_dot_plus_one} -1 minor_and_later)
  if("${minor_and_later}" STREQUAL "")
    # Something like "22." has no minor component at all.
    message(FATAL_ERROR "Failed to parse major.minor from ${FULL_VERSION}")
  endif()
  # Locate the (optional) dot that separates minor from whatever follows it.
  string(FIND "${minor_and_later}" . second_dot_relative)
  if(second_dot_relative EQUAL -1)
    # Plain "X.Y": nothing to strip.
    set(major_minor "${FULL_VERSION}")
  else()
    # Keep everything before the second dot. Its index in FULL_VERSION is the index of the first
    # dot, plus one for that dot, plus the offset found within minor_and_later.
    math(EXPR second_dot_plus_one "${first_dot}+${second_dot_relative}+1")
    string(SUBSTRING "${FULL_VERSION}" 0 ${second_dot_plus_one} major_minor)
  endif()
  set(${CNRN_PARSE_VERSION_OUTPUT_MAJOR_MINOR}
      "${major_minor}"
      PARENT_SCOPE)
endfunction()

# =============================================================================
# Prepare compiler flags for GPU target
# =============================================================================
# Everything in this block only applies to GPU builds. It extends CORENRN_COMPILE_DEFS, builds up
# NVHPC_ACC_COMP_FLAGS (also appended to CMAKE_EXE_LINKER_FLAGS) and sets NVHPC_CXX_INLINE_FLAGS.
if(CORENRN_ENABLE_GPU)
  # Get the NVC++ version number for use in nrnivmodl_core_makefile.in
  cnrn_parse_version(${CMAKE_CXX_COMPILER_VERSION} OUTPUT_MAJOR_MINOR
                     CORENRN_NVHPC_MAJOR_MINOR_VERSION)
  # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs
  list(APPEND CORENRN_COMPILE_DEFS CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU)
  # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and
  # stopping profiling. This makes sure those headers can be found.
  include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  # cuda unified memory support
  if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY)
    list(APPEND CORENRN_COMPILE_DEFS CORENEURON_UNIFIED_MEMORY)
  endif()
  # Determine CORENRN_CUDA_VERSION_SHORT ("X.Y"); how depends on the CMake version in use.
  if(${CMAKE_VERSION} VERSION_LESS 3.17)
    # Hopefully we can drop this soon. Parse ${CMAKE_CUDA_COMPILER_VERSION} into a shorter X.Y
    # version without any patch version.
    if(NOT ${CMAKE_CUDA_COMPILER_ID} STREQUAL "NVIDIA")
      message(FATAL_ERROR "Unsupported CUDA compiler ${CMAKE_CUDA_COMPILER_ID}")
    endif()
    cnrn_parse_version(${CMAKE_CUDA_COMPILER_VERSION} OUTPUT_MAJOR_MINOR CORENRN_CUDA_VERSION_SHORT)
  else()
    # This is a lazy way of getting the major/minor versions separately without parsing
    # ${CMAKE_CUDA_COMPILER_VERSION}
    find_package(CUDAToolkit 9.0 REQUIRED)
    # Be a bit paranoid
    if(NOT ${CMAKE_CUDA_COMPILER_VERSION} STREQUAL ${CUDAToolkit_VERSION})
      message(
        FATAL_ERROR
          "CUDA compiler (${CMAKE_CUDA_COMPILER_VERSION}) and toolkit (${CUDAToolkit_VERSION}) versions are not the same!"
      )
    endif()
    set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
  endif()
  # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
  # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
  # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
  # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
  # code is compiled with the same CUDA version as the explicit CUDA code.
  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
  # Combining -gpu=lineinfo with -O0 -g gives a warning: Conflicting options --device-debug and
  # --generate-line-info specified, ignoring --generate-line-info option
  # The sub-options below are appended with a leading comma, i.e. they extend the -gpu= argument.
  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    string(APPEND NVHPC_ACC_COMP_FLAGS ",debug")
  else()
    string(APPEND NVHPC_ACC_COMP_FLAGS ",lineinfo")
  endif()
  # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
  # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
  # same default compute capabilities as each other, particularly on GPU-less build machines.
  foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
    string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
  endforeach()
  # Select the offload programming model; anything other than OpenMP or OpenACC is rejected.
  if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
    # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
    # for a region then prefer OpenMP.
    list(APPEND CORENRN_COMPILE_DEFS CORENEURON_PREFER_OPENMP_OFFLOAD)
    string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
  elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
    # Only enable OpenACC offload for GPU
    string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
  else()
    message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
  endif()
  # The same flags are also used when linking executables.
  string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")
  # Use the `-Mautoinline` option only when compiling .cpp files generated from .mod files. This is
  # especially needed when we compile with the -O0 or -O1 optimisation levels, where we otherwise
  # get link errors. Using `-Mautoinline` ensures that the necessary functions, like
  # `net_receive_kernel`, are inlined for OpenACC code generation.
  set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
endif()

# =============================================================================
# Initialise global properties that will be used by NEURON to link with CoreNEURON
# =============================================================================
if(CORENRN_ENABLE_GPU)
  # CORENRN_LIB_LINK_FLAGS is the full set of flags needed to link against libcorenrnmech.so:
  # something like `-acc -lcorenrnmech ...`. CORENRN_NEURON_LINK_FLAGS only contains flags that need
  # to be used when linking the NEURON Python module to make sure it is able to dynamically load
  # libcorenrnmech.so.
  # Note that this sets (does not append to) the property, so it holds exactly the GPU flags after
  # this line. NOTE(review): presumably the -lcorenrnmech part is appended by a script that runs
  # later -- confirm the include order.
  set_property(GLOBAL PROPERTY CORENRN_LIB_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
  if(CORENRN_ENABLE_SHARED)
    # Because of
    # https://forums.developer.nvidia.com/t/dynamically-loading-an-openacc-enabled-shared-library-from-an-executable-compiled-with-nvc-does-not-work/210968
    # we have to tell NEURON to pass OpenACC flags when linking special, otherwise we end up with an
    # `nrniv` binary that cannot dynamically load CoreNEURON in shared-library builds.
    set_property(GLOBAL PROPERTY CORENRN_NEURON_LINK_FLAGS "${NVHPC_ACC_COMP_FLAGS}")
  endif()
endif()

# NEURON needs to have access to this when CoreNEURON is built as a submodule. If CoreNEURON is
# installed externally then this is set via coreneuron-config.cmake
set_property(GLOBAL PROPERTY CORENRN_ENABLE_SHARED ${CORENRN_ENABLE_SHARED})

if(CORENRN_HAVE_NVHPC_COMPILER)
  # The diagnostic numbering scheme changed in newer compiler releases, see
  # https://forums.developer.nvidia.com/t/many-all-diagnostic-numbers-increased-by-1-from-previous-values/146268/3
  # so the numbers below are only applied from version 20.7 onwards.
  if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 20.7)
    # Suppressions applied to all sources (list established from a clean start on 13 August 2021;
    # ideally these would be restricted to the files that need them):
    # ~~~
    # "include/Random123/array.h", warning #111-D: statement is unreachable
    # "include/Random123/features/sse.h", warning #550-D: variable "edx" was set but never used
    # ~~~
    set(CORENEURON_CXX_WARNING_SUPPRESSIONS --diag_suppress=111,550)

    # Suppression that is only needed for Boost unit test sources:
    # ~~~
    # "boost/test/unit_test_log.hpp", warning #612-D: overloaded virtual function "..." is only partially overridden in class "..."
    # ~~~
    set(CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS --diag_suppress=612)

    # .cpp files translated from .mod files get the common suppressions plus two extra ones:
    # ~~~
    # "x86_64/corenrn/mod2c/pattern.cpp", warning #161-D: unrecognized #pragma
    # "x86_64/corenrn/mod2c/svclmp.cpp", warning #177-D: variable "..." was declared but never referenced
    # ~~~
    # The result is a single space-separated string.
    set(CORENEURON_TRANSLATED_CODE_COMPILE_FLAGS
        "${CORENEURON_CXX_WARNING_SUPPRESSIONS} --diag_suppress=161,177")
  endif()
endif()


================================================
FILE: CMake/TestScriptUtils.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# Utility functions for manipulating test labels and producing
# tests from scripts:
#
# 1. add_test_class(label [label2 ...])
#
#    Create a target with name test-label (or test-label-label2 etc.)
#    which runs only those tests possessing all of the supplied labels.
#
#
# 2. add_test_label(name label ...)
#
#    Add the given labels to the test 'name'.
#
#
# 3. add_test_script(name script interp)
#
#    Add a test 'name' that runs the given script, using the
#    interpreter 'interp'. If no interpreter is supplied,
#    the script will be run with /bin/sh.
#
#    Uses the following variables to customize the new test:
#    * TEST_LABEL, ${NAME}_TEST_LABEL
#          If defined, apply the label(s) in these variable to the
#          new test.
#    * TEST_ARGS, ${NAME}_TEST_ARGS
#          Additional arguments to pass to the script.
#          ${NAME}_TEST_ARGS takes priority over TEST_ARGS.
#    * TEST_ENVIRONMENT
#          Additional environment variables to define for the test;
#          added to test properties.
#    * TEST_PREFIX, ${NAME}_TEST_PREFIX
#          If defined, preface the interpreter with this prefix.
#          ${NAME}_TEST_PREFIX takes priority over TEST_PREFIX.
# ~~~

# add_test_label(name label ...)
#
# Appends every extra argument to the LABELS property of the test `name`, then makes sure a
# per-label test class target exists for each of those labels.
function(add_test_label NAME)
  set_property(TEST ${NAME} APPEND PROPERTY LABELS ${ARGN})
  foreach(label_name ${ARGN})
    # One single-label class per label (add_test_class is a no-op if the target already exists).
    add_test_class(${label_name})
  endforeach()
endfunction()

# add_test_script(name script [interp])
#
# Registers a CTest test `name` that runs `script` through the interpreter `interp`.
#
# * name:   test name; also used to look up the ${name}_TEST_* override variables.
# * script: path of the script; relative paths are resolved against CMAKE_CURRENT_SOURCE_DIR.
# * interp: optional interpreter (may be a list such as "python3;-u"). When it is omitted or
#   evaluates to false, /bin/sh is used.
#
# Customisation variables read from the calling scope:
#
# * TEST_PREFIX / ${name}_TEST_PREFIX: command prefix placed before the interpreter.
# * TEST_ARGS / ${name}_TEST_ARGS: extra arguments appended after the script path.
# * TEST_LABEL and ${name}_TEST_LABEL: labels attached to the test (both are applied).
# * TEST_ENVIRONMENT: value for the test's ENVIRONMENT property.
#
# The test runs with CMAKE_CURRENT_BINARY_DIR as its working directory.
function(add_test_script NAME SCRIPT)
  # The interpreter is deliberately not a named parameter: CMake rejects calls that pass fewer
  # arguments than there are named parameters, which would make the documented "no interpreter"
  # form impossible. ARGV2 is only read when it was really supplied.
  set(INTERP "")
  if(ARGC GREATER 2)
    set(INTERP "${ARGV2}")
  endif()
  if(NOT INTERP)
    set(INTERP "/bin/sh")
  endif()

  # The test-specific prefix takes priority over the generic one.
  set(RUN_PREFIX ${TEST_PREFIX})
  if(${NAME}_TEST_PREFIX)
    set(RUN_PREFIX ${${NAME}_TEST_PREFIX})
  endif()

  # The test-specific arguments take priority over the generic ones.
  set(RUN_ARGS ${TEST_ARGS})
  if(${NAME}_TEST_ARGS)
    set(RUN_ARGS ${${NAME}_TEST_ARGS})
  endif()

  set(SCRIPT_PATH "${SCRIPT}")
  if(NOT IS_ABSOLUTE "${SCRIPT_PATH}")
    set(SCRIPT_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${SCRIPT_PATH}")
  endif()

  add_test(
    NAME ${NAME}
    COMMAND ${RUN_PREFIX} ${INTERP} "${SCRIPT_PATH}" ${RUN_ARGS}
    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")

  # Add test labels (generic and test-specific labels are combined)
  set(TEST_LABELS ${TEST_LABEL} ${${NAME}_TEST_LABEL})
  if(TEST_LABELS)
    add_test_label(${NAME} ${TEST_LABELS})
  endif()

  if(TEST_ENVIRONMENT)
    set_property(TEST ${NAME} PROPERTY ENVIRONMENT ${TEST_ENVIRONMENT})
  endif()
endfunction()

# add_test_class(label [label2 ...])
#
# Creates, once, a custom target named test-<label>[-<label2>...] that invokes ctest from
# ${PROJECT_NAME}_BINARY_DIR with one anchored `-L ^<label>$` filter per supplied label.
function(add_test_class)
  # Target name suffix: the labels joined with '-'.
  string(REPLACE ";" "-" class_suffix "${ARGN}")
  if(TARGET test-${class_suffix})
    # Already created by an earlier call; nothing to do.
    return()
  endif()

  # Turn "a;b" into "a$$;-L;^b" so that the command below expands to `-L ^a$$ -L ^b$$`.
  # NOTE(review): `$$` is presumably the build-tool escape for a literal `$` -- confirm.
  string(REPLACE ";" "$$;-L;^" label_filters "${ARGN}")

  add_custom_target(
    "test-${class_suffix}"
    COMMAND ${CMAKE_CTEST_COMMAND} -L ^${label_filters}$$
    WORKING_DIRECTORY ${${PROJECT_NAME}_BINARY_DIR}
    COMMENT "Running all ${ARGN} tests")
endfunction()


================================================
FILE: CMake/config/CompilerFlagsHelpers.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# CompilerFlagsHelpers.cmake
# set of Convenience functions for portable compiler flags
# ~~~

# Languages for which the helper variables below are generated.
set(SUPPORTED_COMPILER_LANGUAGE_LIST "CXX")

# Detect the compiler: for every supported language set exactly one
# CMAKE_<LANG>_COMPILER_IS_<FAMILY> flag to ON, based on CMAKE_<LANG>_COMPILER_ID. Compilers that
# match none of the IDs below get no flag and fall through to the generic defaults later on.
#
# The compiler ID variables are referenced by name (not via ${...}) so that if() dereferences them
# exactly once; an unquoted ${...} breaks on an empty ID and may be re-evaluated as a variable name.
foreach(COMPILER_LANGUAGE ${SUPPORTED_COMPILER_LANGUAGE_LIST})
  if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "XL")
    set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_XLC ON)
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "Intel")
    set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_ICC ON)
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "MSVC")
    # set() without a value would unset the variable, so the ON is required.
    set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_MSVC ON)
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "Clang")
    set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_CLANG ON)
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "GNU")
    set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_GCC ON)
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "Cray")
    set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_CRAY ON)
  endif()
endforeach()

# For every supported language, define portable helper variables describing the flags of the
# detected compiler family:
#
# * CMAKE_<LANG>_WARNING_ALL, CMAKE_<LANG>_DEBUGINFO_FLAGS
# * CMAKE_<LANG>_OPT_NONE / _OPT_NORMAL / _OPT_AGGRESSIVE / _OPT_FASTEST
# * CMAKE_<LANG>_STACK_PROTECTION, CMAKE_<LANG>_POSITION_INDEPENDENT
# * CMAKE_<LANG>_VECTORIZE, CMAKE_<LANG>_LINK_TIME_OPT, CMAKE_<LANG>_GEN_NATIVE (where available)
# * ADDITIONAL_THREADSAFE_FLAGS, IGNORE_UNKNOWN_PRAGMA_FLAGS (where available)
#
# Not every branch sets every variable; unset variables simply expand to nothing.
foreach(COMPILER_LANGUAGE ${SUPPORTED_COMPILER_LANGUAGE_LIST})
  # XLC compiler
  if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_XLC)
    # ~~~
    # XLC -qinfo=all is awfully verbose on any platforms that use the GNU STL
    # Enable by default only the relevant one
    # ~~~
    set(CMAKE_${COMPILER_LANGUAGE}_WARNING_ALL "-qformat=all -qinfo=lan:trx:ret:zea:cmp:ret")

    set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "-g")

    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "-O0")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "-O2")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O3")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "-O5")

    set(CMAKE_${COMPILER_LANGUAGE}_STACK_PROTECTION "-qstackprotect")

    set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDENT "-qpic=small")

    set(CMAKE_${COMPILER_LANGUAGE}_VECTORIZE "-qhot")
    set(ADDITIONAL_THREADSAFE_FLAGS "-qthreaded")
    set(IGNORE_UNKNOWN_PRAGMA_FLAGS "-qsuppress=1506-224")

    # Microsoft compiler
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_MSVC)

    set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "-Zi")

    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "-O2")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O2")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "-O2")

    set(CMAKE_${COMPILER_LANGUAGE}_STACK_PROTECTION "-GS")

    # enable by default on MSVC
    set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDENT "")

    # GCC
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_GCC)

    set(CMAKE_${COMPILER_LANGUAGE}_WARNING_ALL "-Wall")
    set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "-g")

    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "-O0")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "-O2")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O3")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "-Ofast -march=native")

    set(CMAKE_${COMPILER_LANGUAGE}_STACK_PROTECTION "-fstack-protector")

    set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDENT "-fPIC")

    set(CMAKE_${COMPILER_LANGUAGE}_VECTORIZE "-ftree-vectorize")
    set(IGNORE_UNKNOWN_PRAGMA_FLAGS "-Wno-unknown-pragmas")

    # Link-time optimisation is only offered for GCC newer than 4.7.0
    if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_VERSION VERSION_GREATER "4.7.0")
      set(CMAKE_${COMPILER_LANGUAGE}_LINK_TIME_OPT "-flto")
    endif()

    if((CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^ppc") OR (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^power"
                                                       ))
      # ppc arch do not support -march= syntax
      set(CMAKE_${COMPILER_LANGUAGE}_GEN_NATIVE "-mcpu=native")
    else()
      set(CMAKE_${COMPILER_LANGUAGE}_GEN_NATIVE "-march=native")
    endif()

    # CLANG
  elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_CLANG)
    set(CMAKE_${COMPILER_LANGUAGE}_WARNING_ALL "-Wall")
    set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "-g")

    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "-O0")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "-O2")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O3")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "-Ofast -march=native")

    set(CMAKE_${COMPILER_LANGUAGE}_STACK_PROTECTION "-fstack-protector")
    set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDENT "-fPIC")

    # Force the same ld behaviour as when called from gcc: --as-needed makes the linker check
    # whether a dynamic library mentioned on the command line is actually needed by the objects
    # being linked. Symbols needed in shared objects are already linked when building that library.
    #
    # The flag is appended rather than assigned so that linker flags supplied by the user or by a
    # parent project are not silently discarded.
    string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,--as-needed")
    string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,--as-needed")

    # rest of the world
  else()
    set(CMAKE_${COMPILER_LANGUAGE}_WARNING_ALL "-Wall")
    set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "-g")

    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "-O0")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "-O2")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O3")
    set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "-O3")

    set(CMAKE_${COMPILER_LANGUAGE}_STACK_PROTECTION "")
    set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDENT "-fPIC")
    set(CMAKE_${COMPILER_LANGUAGE}_VECTORIZE "")

    if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_ICC)
      # unknown compiler flags produce error on Cray and hence just set this for intel now
      set(IGNORE_UNKNOWN_PRAGMA_FLAGS "-Wno-unknown-pragmas")
      # Intel O3 is extreme
      set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O2")
    endif()

    # No "all warnings" flag is requested for PGI
    if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "PGI")
      set(CMAKE_${COMPILER_LANGUAGE}_WARNING_ALL "")
    endif()
  endif()

endforeach()

# ===============================================================================
# Allow undefined reference in shared library as mod files will be linked later
# ===============================================================================
# On macOS (AppleClang, or any compiler targeting Darwin) shared libraries must be told explicitly
# to tolerate undefined symbols. CMAKE_SYSTEM_NAME is referenced by name so that if() dereferences
# it exactly once; an unquoted ${CMAKE_SYSTEM_NAME} would fail on an empty value and could be
# re-evaluated as a variable name.
if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang" OR CMAKE_SYSTEM_NAME MATCHES "Darwin")
  set(UNDEFINED_SYMBOLS_IGNORE_FLAG "-undefined dynamic_lookup")
  string(APPEND CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS " ${UNDEFINED_SYMBOLS_IGNORE_FLAG}")
  string(APPEND CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS " ${UNDEFINED_SYMBOLS_IGNORE_FLAG}")
endif()


================================================
FILE: CMake/config/ReleaseDebugAutoFlags.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# ReleaseDebugAutoFlags.cmake
# Release / Debug configuration helper
# ~~~

# default configuration
# Fall back to RelWithDebInfo when neither a build type nor a list of configuration types is set.
if(NOT (CMAKE_BUILD_TYPE OR CMAKE_CONFIGURATION_TYPES))
  set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Choose the type of build." FORCE)
  message(STATUS "Setting build type to '${CMAKE_BUILD_TYPE}' as none was specified.")
endif()

# =============================================================================
# Different build types
# =============================================================================
# ~~~
# Debug : Optimized for debugging, include debug symbols
# Release : Release mode, no debuginfo
# RelWithDebInfo : Distribution mode, basic optimizations for portable code with debuginfos
# Fast : Maximum level of optimization. Target native architecture, not portable code
# ~~~

include(CompilerFlagsHelpers)

# ~~~
# Assemble the per-configuration flag strings for C and C++ from the CMAKE_<LANG>_* helper
# variables: Release = normal optimisation; Debug = debug info + no optimisation + stack
# protection; RelWithDebInfo = debug info + normal optimisation; Fast = fastest optimisation +
# link-time optimisation + native code generation.
foreach(_flags_lang C CXX)
  set(CMAKE_${_flags_lang}_FLAGS_RELEASE "${CMAKE_${_flags_lang}_OPT_NORMAL}")
  set(CMAKE_${_flags_lang}_FLAGS_DEBUG
      "${CMAKE_${_flags_lang}_DEBUGINFO_FLAGS}  ${CMAKE_${_flags_lang}_OPT_NONE} ${CMAKE_${_flags_lang}_STACK_PROTECTION}"
  )
  set(CMAKE_${_flags_lang}_FLAGS_RELWITHDEBINFO
      "${CMAKE_${_flags_lang}_DEBUGINFO_FLAGS}  ${CMAKE_${_flags_lang}_OPT_NORMAL}")
  set(CMAKE_${_flags_lang}_FLAGS_FAST
      " ${CMAKE_${_flags_lang}_OPT_FASTEST} ${CMAKE_${_flags_lang}_LINK_TIME_OPT} ${CMAKE_${_flags_lang}_GEN_NATIVE}"
  )
endforeach()
# ~~~


================================================
FILE: CMake/config/SetRpath.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Use @rpath in the install name of every shared library that gets built (macOS).
set(CMAKE_MACOSX_RPATH 1)

# ~~~
# RPATH handling is skipped on Cray systems: xlc on platforms like bgq did not
# cope with rpath in static builds and a similar issue was seen on Cray.
# ~~~
if(NOT CRAY_SYSTEM)
  set(LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib")

  # Build tree: keep the full RPATH and do not use the install RPATH yet; it is only applied at
  # install time.
  set(CMAKE_SKIP_BUILD_RPATH FALSE)
  set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)

  # ~~~
  # Install tree: also keep the automatically determined link directories that
  # live outside the build tree.
  # ~~~
  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

  # Add the library install directory to the install RPATH unless it is one of the platform's
  # implicit (system) link directories; list(FIND) yields -1 when it is not in that list.
  list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${LIB_INSTALL_DIR}" isSystemDir)
  if(isSystemDir EQUAL -1)
    set(CMAKE_INSTALL_RPATH "${LIB_INSTALL_DIR}")
  endif()
endif()


================================================
FILE: CMake/config/TestHelpers.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# TestHelpers.cmake
# set of Convenience functions for unit testing with cmake
# ~~~

# enable or disable detection of SLURM and MPIEXEC
option(AUTO_TEST_WITH_SLURM "Add srun as test prefix in a SLURM environment" TRUE)
option(AUTO_TEST_WITH_MPIEXEC "Add mpiexec as test prefix in a MPICH2/OpenMPI environment" TRUE)

# ~~~
# Basic SLURM support: the prefix "srun" is added to any test in the environment. For a
# slurm test execution, simply run "salloc [your_exec_parameters] ctest"
# ~~~
if(AUTO_TEST_WITH_SLURM)
  # Respect a user-provided SLURM_SRUN_COMMAND (even an empty one) and only search otherwise.
  if(NOT DEFINED SLURM_SRUN_COMMAND)
    # find_program() has no QUIET option; passing it would only add a bogus "QUIET" entry to the
    # HINTS search paths, so it is not used here.
    find_program(
      SLURM_SRUN_COMMAND
      NAMES "srun"
      HINTS "${SLURM_ROOT}/bin")
  endif()

  if(SLURM_SRUN_COMMAND)
    # srun is used both for plain tests and for MPI tests
    set(TEST_EXEC_PREFIX_DEFAULT "${SLURM_SRUN_COMMAND}")
    set(TEST_MPI_EXEC_PREFIX_DEFAULT "${SLURM_SRUN_COMMAND}")
    set(TEST_MPI_EXEC_BIN_DEFAULT "${SLURM_SRUN_COMMAND}")
    set(TEST_WITH_SLURM ON)
  endif()

endif()

# Basic mpiexec support, will just forward mpiexec as prefix. Only considered when srun was not
# selected above.
if(AUTO_TEST_WITH_MPIEXEC AND NOT TEST_WITH_SLURM)
  if(NOT DEFINED MPIEXEC)
    find_program(
      MPIEXEC
      NAMES "mpiexec"
      HINTS "${MPI_ROOT}/bin")
  endif()

  if(MPIEXEC)
    # mpiexec is only used as a prefix for MPI tests, not for plain tests
    set(TEST_MPI_EXEC_PREFIX_DEFAULT "${MPIEXEC}")
    set(TEST_MPI_EXEC_BIN_DEFAULT "${MPIEXEC}")
    set(TEST_WITH_MPIEXEC ON)
  endif()
endif()

# ~~~
# MPI executor program path, without arguments, used for testing.
# default: srun or mpiexec if found
# ~~~
set(TEST_MPI_EXEC_BIN
    "${TEST_MPI_EXEC_BIN_DEFAULT}"
    CACHE STRING "path of the MPI executor (mpiexec, mpirun) for test execution")

# ~~~
# Test execution prefix. Override this variable for any execution prefix required
# in a clustered environment.
#
# To manually specify a command with arguments, use the cmake list syntax, e.g.
# -DTEST_EXEC_PREFIX="/usr/bin/srun;-n;4" for an srun execution with 4 tasks
#
# default: srun if found
# ~~~
set(TEST_EXEC_PREFIX
    "${TEST_EXEC_PREFIX_DEFAULT}"
    CACHE STRING "prefix command for the test executions")

# ~~~
# Test execution prefix specific to MPI programs.
#
# To manually specify a command with arguments, use the cmake list syntax, e.g.
# -DTEST_MPI_EXEC_PREFIX="/usr/bin/mpiexec;-n;4" for an MPI execution with 4 processes
#
# default: srun or mpiexec if found
# ~~~
set(TEST_MPI_EXEC_PREFIX
    "${TEST_MPI_EXEC_PREFIX_DEFAULT}"
    CACHE STRING "prefix command for the MPI test executions")


================================================
FILE: CMake/coreneuron-config.cmake.in
================================================
# =============================================================================
# Copyright (C) 2016-2022 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# coreneuron-config.cmake - package configuration file

# Directory containing this configuration file; all lookups below are relative to it.
get_filename_component(CONFIG_PATH "${CMAKE_CURRENT_LIST_FILE}" PATH)

# Values substituted by configure_file() when CoreNEURON is configured.
set(CORENRN_VERSION_MAJOR @PROJECT_VERSION_MAJOR@)
set(CORENRN_VERSION_MINOR @PROJECT_VERSION_MINOR@)
set(CORENRN_VERSION_PATCH @PROJECT_VERSION_PATCH@)
set(CORENRN_ENABLE_GPU @CORENRN_ENABLE_GPU@)
set(CORENRN_ENABLE_NMODL @CORENRN_ENABLE_NMODL@)
set(CORENRN_ENABLE_REPORTING @CORENRN_ENABLE_REPORTING@)
set(CORENRN_ENABLE_SHARED @CORENRN_ENABLE_SHARED@)
set(CORENRN_LIB_LINK_FLAGS "@CORENRN_LIB_LINK_FLAGS@")
set(CORENRN_NEURON_LINK_FLAGS "@CORENRN_NEURON_LINK_FLAGS@")

# Locate the installed headers. The source tree provides coreneuron/coreneuron.hpp, so accept both
# the historical .h name and the .hpp name.
# NOTE(review): assumes the header is installed under include/coreneuron/ -- confirm against the
# install rules.
find_path(
  CORENEURON_INCLUDE_DIR
  NAMES "coreneuron/coreneuron.h" "coreneuron/coreneuron.hpp"
  HINTS "${CONFIG_PATH}/../../include")
# Locate the directory holding the mechanism library (static or shared, Linux or macOS naming).
find_path(
  CORENEURON_LIB_DIR
  NAMES libcorenrnmech.a libcorenrnmech.so libcorenrnmech.dylib
  HINTS "${CONFIG_PATH}/../../lib")

include(${CONFIG_PATH}/coreneuron.cmake)


================================================
FILE: CMake/packages/FindSphinx.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# Look for the sphinx-build executable; the result is stored in the SPHINX_EXECUTABLE cache entry.
find_program(
  SPHINX_EXECUTABLE
  NAMES sphinx-build
  DOC "/path/to/sphinx-build")

include(FindPackageHandleStandardArgs)

# Report the package as found only if SPHINX_EXECUTABLE was located; the string is the custom
# failure message.
find_package_handle_standard_args(Sphinx "Failed to find sphinx-build executable" SPHINX_EXECUTABLE)


================================================
FILE: CMake/packages/Findlikwid.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# Findlikwid
# -------------
#
# Locate the likwid RRZE Performance Monitoring and Benchmarking Suite.
#
# Usage:
#
# ::
#   set(LIKWID_DIR "" CACHE PATH "Path likwid performance monitoring and benchmarking suite")
#   find_package(likwid REQUIRED)
#   include_directories(${likwid_INCLUDE_DIRS})
#   target_link_libraries(foo ${likwid_LIBRARIES})
#
# Variables set by this module:
#
# ::
#
#   likwid_FOUND        - set to true if the library is found
#   likwid_INCLUDE_DIRS - directory containing likwid.h
#   likwid_LIBRARIES    - the likwid library to link against
# ~~~

# Header and library are searched below LIKWID_DIR first, then in the default locations.
find_path(
  likwid_INCLUDE_DIRS
  NAMES likwid.h
  HINTS "${LIKWID_DIR}/include")
find_library(
  likwid_LIBRARIES
  NAMES likwid
  HINTS "${LIKWID_DIR}/lib")

# Handles 'REQUIRED', 'QUIET' and version checks.
include(FindPackageHandleStandardArgs)

find_package_handle_standard_args(likwid REQUIRED_VARS likwid_INCLUDE_DIRS likwid_LIBRARIES)


================================================
FILE: CMake/packages/Findnmodl.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# Findnmodl
# -------------
#
# Find nmodl
#
# Find the nmodl Blue Brain HPC utils library
#
# Using nmodl:
#
# ::
#   set(CORENRN_NMODL_DIR "" CACHE PATH "Path to nmodl source-to-source compiler installation")
#   find_package(nmodl REQUIRED)
#   include_directories(${nmodl_INCLUDE})
#
# This module sets the following variables:
#
# ::
#
#   nmodl_FOUND      - set to true if nmodl is found
#   nmodl_INCLUDE    - directory containing nmodl/fast_math.hpp
#   nmodl_BINARY     - the nmodl binary
#   nmodl_PYTHONPATH - directory containing the nmodl Python package (nmodl/__init__.py)
# ~~~

# UNIX paths are standard, no need to write.
#
# Note: find_program() has no QUIET option; passing it would only add a bogus "QUIET" entry to the
# HINTS search paths, so it is not used here.
find_program(
  nmodl_BINARY
  NAMES nmodl${CMAKE_EXECUTABLE_SUFFIX}
  HINTS "${CORENRN_NMODL_DIR}/bin")

find_path(nmodl_INCLUDE "nmodl/fast_math.hpp" HINTS "${CORENRN_NMODL_DIR}/include")
find_path(nmodl_PYTHONPATH "nmodl/__init__.py" HINTS "${CORENRN_NMODL_DIR}/lib")

# Checks 'REQUIRED', 'QUIET' and versions.
include(FindPackageHandleStandardArgs)

find_package_handle_standard_args(
  nmodl
  FOUND_VAR nmodl_FOUND
  REQUIRED_VARS nmodl_BINARY nmodl_INCLUDE nmodl_PYTHONPATH)


================================================
FILE: CMake/packages/Findreportinglib.cmake
================================================
# =============================================================================
# Copyright (C) 2016-2021 Blue Brain Project
#
# See top-level LICENSE file for details.
# =============================================================================

# ~~~
# Findreportinglib
# -------------
#
# Find reportinglib
#
# Find the reportinglib Blue Brain HPC utils library
#
# Using reportinglib:
#
# ::
#
#   find_package(reportinglib REQUIRED)
#   include_directories(${reportinglib_INCLUDE_DIRS})
#   target_link_libraries(foo ${reportinglib_LIBRARIES})
#
# This module sets the following variables:
#
# ::
#
#   reportinglib_FOUND - set to true if the library is found
#   reportinglib_INCLUDE_DIRS - list of required include directories
#   reportinglib_LIBRARIES - list of libraries to be linked
#   reportinglib_INCLUDE_DIR - directory containing reportinglib/Report.h
#   reportinglib_LIBRARY - path of the reportinglib library
#   reportinglib_LIB_DIR - directory containing the reportinglib library
#   reportinglib_somaDump - path of the somaDump program, if found
# ~~~

# UNIX paths are standard, no need to write.
find_path(reportinglib_INCLUDE_DIR reportinglib/Report.h)
find_library(reportinglib_LIBRARY reportinglib)
# Quoted so that the call stays well-formed whatever the library lookup produced.
get_filename_component(reportinglib_LIB_DIR "${reportinglib_LIBRARY}" DIRECTORY)
# somaDump is looked up in the bin directory next to the library directory.
find_program(reportinglib_somaDump somaDump ${reportinglib_LIB_DIR}/../bin)

# Checks 'REQUIRED', 'QUIET' and versions.
include(FindPackageHandleStandardArgs)

find_package_handle_standard_args(
  reportinglib
  FOUND_VAR reportinglib_FOUND
  REQUIRED_VARS reportinglib_INCLUDE_DIR reportinglib_LIBRARY reportinglib_LIB_DIR)

# Provide the documented plural result variables in addition to the singular ones used so far.
if(reportinglib_FOUND)
  set(reportinglib_INCLUDE_DIRS "${reportinglib_INCLUDE_DIR}")
  set(reportinglib_LIBRARIES "${reportinglib_LIBRARY}")
endif()


================================================
FILE: CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
# CoreNEURON's version jumped from 1.0 to 8.2.0 with the introduction of the NRN_VERSION_* macros
# for use in VERBATIM blocks. Starting from this version, the NEURON and CoreNEURON versions are
# locked together. A version has to be hardcoded here to handle the case that CoreNEURON is built
# standalone.
#
# Only the C++ language is enabled at this point.
project(
  coreneuron
  VERSION 9.0.0
  LANGUAGES CXX)

# ~~~
# Floating point version numbers are a bad idea because preprocessor macros
# can't handle them. The version is therefore also provided as a single integer,
# major * 100 + minor (e.g. 9.0 becomes 900); the patch level is not included.
# ~~~
math(EXPR CORENEURON_VERSION_COMBINED
     "${coreneuron_VERSION_MAJOR} * 100 + ${coreneuron_VERSION_MINOR}")

# =============================================================================
# CMake common project settings
# =============================================================================
# Require C++17 without compiler-specific language extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Default build type; being a non-FORCE cache entry, it does not override a value the user
# already provided.
set(CMAKE_BUILD_TYPE
    RelWithDebInfo
    CACHE STRING "Empty or one of Debug, Release, RelWithDebInfo")

# Fail early if the compiler does not advertise the cxx_std_17 compile feature.
if(NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES)
  message(
    FATAL_ERROR
      "This compiler does not fully support C++17, choose a higher version or another compiler.")
endif()

# =============================================================================
# Settings to enable project as submodule
# =============================================================================
# Remember CoreNEURON's own source/binary directories; they differ from the top-level ones when
# CoreNEURON is added as a subdirectory of another project.
set(CORENEURON_PROJECT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CORENEURON_PROJECT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})

if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
  # Top-level (standalone) build
  set(CORENEURON_AS_SUBPROJECT OFF)
else()
  set(CORENEURON_AS_SUBPROJECT ON)
  # Expose the CoreNEURON version to the parent project (NEURON) so it can do some sanity checks.
  foreach(_corenrn_part MAJOR MINOR PATCH)
    set_property(GLOBAL PROPERTY CORENRN_VERSION_${_corenrn_part}
                                 ${PROJECT_VERSION_${_corenrn_part}})
  endforeach()
endif()

# If the NEURON version is not completely known (the typical standalone case), let the
# NRN_VERSION_* values resolve to the CoreNEURON version, which is supposed to move in lockstep
# with the NEURON version.
if(NOT (DEFINED NRN_VERSION_MAJOR
        AND DEFINED NRN_VERSION_MINOR
        AND DEFINED NRN_VERSION_PATCH))
  # As a subproject the parent was expected to provide the version, so be louder about it.
  set(level STATUS)
  if(CORENEURON_AS_SUBPROJECT)
    set(level WARNING)
  endif()
  foreach(_corenrn_part MAJOR MINOR PATCH)
    set(NRN_VERSION_${_corenrn_part} ${PROJECT_VERSION_${_corenrn_part}})
  endforeach()
  message(${level} "CoreNEURON could not determine the NEURON version, using the hardcoded "
          "${NRN_VERSION_MAJOR}.${NRN_VERSION_MINOR}.${NRN_VERSION_PATCH}")
endif()

# From here on NRN_VERSION_{MAJOR,MINOR,PATCH} hold the version that translated MOD files should
# claim compatibility with, whether or not CoreNEURON is a submodule of NEURON. The header is
# generated under a dedicated `generated` prefix in the build directory so that
# -I/path/to/src -I/path/to/build/generated is safe (headers from the source prefix are copied
# elsewhere under the build prefix, so there is scope for confusion).
configure_file(coreneuron/config/neuron_version.hpp.in
               generated/coreneuron/config/neuron_version.hpp)

# =============================================================================
# Include cmake modules path
# =============================================================================
# Make the helper modules shipped in CMake/, CMake/packages/ and CMake/config/ visible to
# include() and find_package().
list(APPEND CMAKE_MODULE_PATH ${CORENEURON_PROJECT_SOURCE_DIR}/CMake
     ${CORENEURON_PROJECT_SOURCE_DIR}/CMake/packages ${CORENEURON_PROJECT_SOURCE_DIR}/CMake/config)

# =============================================================================
# HPC Coding Conventions
# =============================================================================
# These two variables are set before including the coding conventions helper and adding its
# subdirectory.
# NOTE(review): presumably they are configuration inputs of hpc-coding-conventions -- confirm.
set(CODING_CONV_PREFIX "CORENRN")
set(CORENRN_3RDPARTY_DIR "external")
include(AddHpcCodingConvSubmodule)
add_subdirectory(CMake/hpc-coding-conventions/cpp)

# =============================================================================
# Enable sanitizer support if the CORENRN_SANITIZERS variable is set
# =============================================================================
include(CMake/hpc-coding-conventions/cpp/cmake/sanitizers.cmake)
# User-provided extra compile flags. Each cache string is split into a CMake list with
# separate_arguments() so that it can be extended with list(APPEND) below.
set(CORENRN_EXTRA_CXX_FLAGS
    ""
    CACHE STRING "Add extra compile flags for CoreNEURON sources")
separate_arguments(CORENRN_EXTRA_CXX_FLAGS)
set(CORENRN_EXTRA_MECH_CXX_FLAGS
    ""
    CACHE STRING "Add extra compile flags for translated mechanisms")
separate_arguments(CORENRN_EXTRA_MECH_CXX_FLAGS)
# The sanitizer flags are added to the CoreNEURON compile flags, the mechanism compile flags and
# the link flags alike.
list(APPEND CORENRN_EXTRA_CXX_FLAGS ${CORENRN_SANITIZER_COMPILER_FLAGS})
list(APPEND CORENRN_EXTRA_MECH_CXX_FLAGS ${CORENRN_SANITIZER_COMPILER_FLAGS})
list(APPEND CORENRN_EXTRA_LINK_FLAGS ${CORENRN_SANITIZER_COMPILER_FLAGS})

# =============================================================================
# Include common cmake modules
# =============================================================================
# Helper modules; the project-specific ones are resolved through CMAKE_MODULE_PATH.
include(CheckIncludeFiles)
include(ReleaseDebugAutoFlags)
include(CrayPortability)
include(SetRpath)
include(CTest)
include(AddRandom123Submodule)
include(GitRevision)

# Third-party dependencies are handled through the hpc-coding-conventions 3rdparty helper; CLI11
# is required.
set(CORENRN_3RDPARTY_DIR external)
include(CMake/hpc-coding-conventions/cpp/cmake/3rdparty.cmake)
cpp_cc_git_submodule(CLI11 BUILD PACKAGE CLI11 REQUIRED)

# =============================================================================
# Build options
# =============================================================================
# User-facing feature switches; the last argument of each option() is its default value.
option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
option(CORENRN_ENABLE_MPI_DYNAMIC "Enable dynamic MPI support" OFF)
option(CORENRN_ENABLE_HOC_EXP "Enable wrapping exp with hoc_exp()" OFF)
option(CORENRN_ENABLE_SPLAYTREE_QUEUING "Enable use of Splay tree for spike queuing" ON)
option(CORENRN_ENABLE_NET_RECEIVE_BUFFER "Enable event buffering in net_receive function" ON)
option(CORENRN_ENABLE_NMODL "Enable external nmodl source-to-source compiler" OFF)
option(CORENRN_ENABLE_CALIPER_PROFILING "Enable Caliper instrumentation" OFF)
option(CORENRN_ENABLE_LIKWID_PROFILING "Enable LIKWID instrumentation" OFF)
option(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY "Enable CUDA unified memory support" OFF)
option(CORENRN_ENABLE_UNIT_TESTS "Enable unit tests execution" ON)
option(CORENRN_ENABLE_GPU "Enable GPU support using OpenACC or OpenMP" OFF)
option(CORENRN_ENABLE_SHARED "Enable shared library build" ON)
option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF)
option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF)

# Optional installation prefixes used as search hints by the nmodl and likwid find modules.
set(CORENRN_NMODL_DIR
    ""
    CACHE PATH "Path to nmodl source-to-source compiler installation")
set(LIKWID_DIR
    ""
    CACHE PATH "Path to likwid performance analysis suite")

# Older CMake versions label NVHPC as PGI, newer ones label it as NVHPC.
#
# CMAKE_CXX_COMPILER_ID is referenced by name so that if() dereferences it exactly once; an
# unquoted ${CMAKE_CXX_COMPILER_ID} would be a syntax error for an empty ID and its expansion
# could be re-evaluated as a variable name.
if(CMAKE_CXX_COMPILER_ID STREQUAL "PGI" OR CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC")
  set(CORENRN_HAVE_NVHPC_COMPILER ON)
else()
  set(CORENRN_HAVE_NVHPC_COMPILER OFF)
endif()

# Accelerator offload is reported as disabled unless GPU support configures it below.
set(CORENRN_ACCELERATOR_OFFLOAD "Disabled")
if(CORENRN_ENABLE_GPU)
  # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after
  # https://github.com/BlueBrain/CoreNeuron/pull/609.

  # Fail hard and early if we don't have the PGI/NVHPC compiler.
  if(NOT CORENRN_HAVE_NVHPC_COMPILER)
    message(
      FATAL_ERROR
        "GPU support is available via OpenACC using PGI/NVIDIA compilers."
        " Use NVIDIA HPC SDK with -DCMAKE_C_COMPILER=nvc -DCMAKE_CUDA_COMPILER=nvcc -DCMAKE_CXX_COMPILER=nvc++"
    )
  endif()

  # Set some sensible default CUDA architectures.
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES 70 80)
    message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
  endif()

  # See https://gitlab.kitware.com/cmake/cmake/-/issues/23081, this should not be needed according
  # to the CMake documentation, but it is not clear that any version behaves as documented.
  if(DEFINED CMAKE_CUDA_HOST_COMPILER)
    unset(ENV{CUDAHOSTCXX})
  endif()

  # Enable CUDA language support.
  enable_language(CUDA)

  # Prefer shared libcudart.so
  if(${CMAKE_VERSION} VERSION_LESS 3.17)
    # Ugly workaround from https://gitlab.kitware.com/cmake/cmake/-/issues/17559, remove when
    # possible
    if(CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES)
      list(REMOVE_ITEM CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES "cudart_static")
      list(REMOVE_ITEM CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES "cudadevrt")
      list(APPEND CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES "cudart")
    endif()
    if(CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES)
      list(REMOVE_ITEM CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES "cudart_static")
      list(REMOVE_ITEM CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES "cudadevrt")
      list(APPEND CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES "cudart")
    endif()
  else()
    # nvc++ -cuda implicitly links dynamically to libcudart.so. Setting this makes sure that CMake
    # does not add -lcudart_static and trigger errors due to mixed dynamic/static linkage.
    set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
  endif()

  # Patch CUDA_ARCHITECTURES support into older CMake versions
  if(${CMAKE_VERSION} VERSION_LESS 3.18)
    foreach(cuda_arch ${CMAKE_CUDA_ARCHITECTURES})
      string(
        APPEND CMAKE_CUDA_FLAGS
        " --generate-code=arch=compute_${cuda_arch},code=[compute_${cuda_arch},sm_${cuda_arch}]")
    endforeach()
  endif()

  # ~~~
  # Needed for the Eigen GPU support Warning suppression (Eigen GPU-related):
  # 3057 : Warning on ignoring __host__ annotation in some functions
  # 3085 : Warning on redeclaring a __host__ function as __host__ __device__
  # ~~~
  set(CMAKE_CUDA_FLAGS
      "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
  )

  if(CORENRN_ENABLE_NMODL)
    # NMODL supports both OpenACC and OpenMP target offload
    if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
      set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP")
    else()
      set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
    endif()
  else()
    # MOD2C only supports OpenACC offload
    set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
  endif()
endif()

# =============================================================================
# Project version from git and project directories
# =============================================================================
# Mirror the project version under the CN_ prefix.
set(CN_PROJECT_VERSION ${PROJECT_VERSION})

# Produce config.cpp in the build tree from its template; only @VAR@ references are substituted.
# The template carries the version number from git and the nrnunits.lib file path.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/coreneuron/config/config.cpp.in"
               "${PROJECT_BINARY_DIR}/coreneuron/config/config.cpp" @ONLY)

# =============================================================================
# Include cmake modules after cmake options
# =============================================================================
# Included only now so that the options above are already set when the module runs.
include(OpenAccHelper)

# =============================================================================
# Common dependencies
# =============================================================================
# Both interpreters are mandatory; configuration stops if either one is missing.
find_package(PythonInterp REQUIRED)
find_package(Perl REQUIRED)

# =============================================================================
# Common build options
# =============================================================================
# Probe for <malloc.h> without printing the check's progress.
set(CMAKE_REQUIRED_QUIET TRUE)
check_include_files(malloc.h have_malloc_h)

# Definition used when building mod files for coreneuron.
list(APPEND CORENRN_COMPILE_DEFS CORENEURON_BUILD)
if(have_malloc_h)
  # Tell the sources that <malloc.h> can be included.
  list(APPEND CORENRN_COMPILE_DEFS HAVE_MALLOC_H)
endif()

# =============================================================================
# Build option specific compiler flags
# =============================================================================
if(CORENRN_ENABLE_NMODL)
  # Eigen is used for "small" matrices and thread-level parallelism is handled at a higher level,
  # so Eigen is told not to multithread internally.
  list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_PARALLELIZE)
endif()
if(CORENRN_HAVE_NVHPC_COMPILER)
  # PGI with llvm code generation doesn't have necessary assembly intrinsic headers
  list(APPEND CORENRN_COMPILE_DEFS EIGEN_DONT_VECTORIZE=1)
  if(NOT CORENRN_ENABLE_GPU)
    if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 21.11)
      # Random123 does not play nicely with NVHPC 21.11+'s detection of ABM features if it detects
      # the compiler to be PGI or NVHPC, see: https://github.com/BlueBrain/CoreNeuron/issues/724
      # and https://github.com/DEShawResearch/random123/issues/6. In GPU builds Random123
      # (mis)detects nvc++ as nvcc because the -cuda option is passed, which avoids the problem.
      # Without GPU support, R123_USE_INTRIN_H=0 is defined to avoid it.
      list(APPEND CORENRN_COMPILE_DEFS R123_USE_INTRIN_H=0)
    endif()
  endif()
  if(CMAKE_VERSION VERSION_LESS 3.19)
    # CMake versions <3.19 used to add -A when using NVHPC/PGI, which makes the compiler
    # excessively pedantic. See https://gitlab.kitware.com/cmake/cmake/-/issues/20997.
    list(REMOVE_ITEM CMAKE_CXX17_STANDARD_COMPILE_OPTION -A)
  endif()
endif()

# Library kind used for targets created with ${COMPILE_LIBRARY_TYPE}: STATIC unless shared
# libraries were requested.
set(COMPILE_LIBRARY_TYPE "STATIC")
if(CORENRN_ENABLE_SHARED)
  set(COMPILE_LIBRARY_TYPE "SHARED")
endif()

if(NOT CORENRN_ENABLE_MPI)
  # Serial build: MPI and multisend are both turned off through definitions.
  list(APPEND CORENRN_COMPILE_DEFS NRNMPI=0 NRN_MULTISEND=0)
else()
  find_package(MPI REQUIRED)
  # NRNMPI=1 enables MPI; the remaining three definitions avoid linking to the C++ bindings.
  list(
    APPEND
    CORENRN_COMPILE_DEFS
    NRNMPI=1
    MPI_NO_CPPBIND=1
    OMPI_SKIP_MPICXX=1
    MPICH_SKIP_MPICXX=1)
endif()

if(CORENRN_ENABLE_OPENMP)
  # OpenMP is looked up quietly; if it is not found the flags are simply left unchanged.
  find_package(OpenMP QUIET)
  if(OPENMP_FOUND)
    # Append the OpenMP flags and ADDITIONAL_THREADSAFE_FLAGS to the global C and C++ flags.
    string(APPEND CMAKE_C_FLAGS " ${OpenMP_C_FLAGS} ${ADDITIONAL_THREADSAFE_FLAGS}")
    string(APPEND CMAKE_CXX_FLAGS " ${OpenMP_CXX_FLAGS} ${ADDITIONAL_THREADSAFE_FLAGS}")
  endif()
endif()

# LAYOUT=0 is defined unconditionally.
list(APPEND CORENRN_COMPILE_DEFS LAYOUT=0)

# Without the hoc_exp() wrapping option, DISABLE_HOC_EXP is defined.
if(NOT CORENRN_ENABLE_HOC_EXP)
  list(APPEND CORENRN_COMPILE_DEFS DISABLE_HOC_EXP)
endif()

# The splay tree is required for net_move.
if(CORENRN_ENABLE_SPLAYTREE_QUEUING)
  list(APPEND CORENRN_COMPILE_DEFS ENABLE_SPLAYTREE_QUEUING)
endif()

# NET_RECEIVE_BUFFERING is only defined (to 0) when buffering is switched off.
if(NOT CORENRN_ENABLE_NET_RECEIVE_BUFFER)
  list(APPEND CORENRN_COMPILE_DEFS NET_RECEIVE_BUFFERING=0)
endif()

# DISABLE_TIMEOUT is only defined when the nrn_timeout implementation is switched off.
if(NOT CORENRN_ENABLE_TIMEOUT)
  list(APPEND CORENRN_COMPILE_DEFS DISABLE_TIMEOUT)
endif()

if(CORENRN_ENABLE_REPORTING)
  # Look for both report backends and for the h5dump tool.
  find_package(reportinglib)
  find_package(sonata)
  find_program(H5DUMP_EXECUTABLE h5dump)

  if(NOT reportinglib_FOUND)
    # Keep these variables defined (empty) so later uses expand to nothing.
    set(reportinglib_INCLUDE_DIR "")
    set(reportinglib_LIBRARY "")
  else()
    list(APPEND CORENRN_COMPILE_DEFS ENABLE_BIN_REPORTS)
    set(ENABLE_BIN_REPORTS_TESTS ON)
  endif()

  if(sonata_FOUND AND TARGET sonata::sonata_report)
    list(APPEND CORENRN_COMPILE_DEFS ENABLE_SONATA_REPORTS)
    set(ENABLE_SONATA_REPORTS_TESTS ON)
  elseif(sonata_FOUND)
    # A SONATA installation that lacks the sonata::sonata_report target cannot be used.
    message(SEND_ERROR "SONATA library was found but without reporting support")
  endif()

  # Reporting was requested, so at least one of the two backends has to be available.
  if(NOT (reportinglib_FOUND OR sonata_FOUND))
    message(SEND_ERROR "Neither reportinglib nor SONATA libraries were found")
  endif()

  include_directories(${reportinglib_INCLUDE_DIR} ${sonatareport_INCLUDE_DIR})
endif()

# Convert the ON/OFF option into the 0/1 value carried by CORENEURON_USE_LEGACY_UNITS.
set(CORENRN_USE_LEGACY_UNITS 0)
if(CORENRN_ENABLE_LEGACY_UNITS)
  set(CORENRN_USE_LEGACY_UNITS 1)
endif()
list(APPEND CORENRN_COMPILE_DEFS CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS})

# Propagate the legacy units flag to both backends by forcing their cache entries to the same value.
foreach(backend_units_option MOD2C_ENABLE_LEGACY_UNITS NMODL_ENABLE_LEGACY_UNITS)
  set(${backend_units_option}
      ${CORENRN_ENABLE_LEGACY_UNITS}
      CACHE BOOL "" FORCE)
endforeach()

# Dynamic MPI only makes sense on top of MPI: reject the combination before adding the definition.
if(CORENRN_ENABLE_MPI_DYNAMIC AND NOT CORENRN_ENABLE_MPI)
  message(FATAL_ERROR "Cannot enable dynamic mpi without mpi")
endif()
if(CORENRN_ENABLE_MPI_DYNAMIC)
  list(APPEND CORENRN_COMPILE_DEFS CORENEURON_ENABLE_MPI_DYNAMIC)
endif()

# Convert the ON/OFF option into a 0/1 value.
set(CORENRN_NRN_PRCELLSTATE 0)
if(CORENRN_ENABLE_PRCELLSTATE)
  set(CORENRN_NRN_PRCELLSTATE 1)
endif()

# Expose the MinGW toolchain to the sources through a definition.
if(MINGW)
  list(APPEND CORENRN_COMPILE_DEFS MINGW)
endif()

# =============================================================================
# NMODL specific options
# =============================================================================
# Choose the MOD translator: MOD2C (submodule) when NMODL is disabled, otherwise an NMODL found by
# find_package() or, failing that, the NMODL submodule.
if(NOT CORENRN_ENABLE_NMODL)
  include(AddMod2cSubmodule)
  set(NMODL_TARGET_TO_DEPEND mod2c_core)
  set(CORENRN_MOD2CPP_BINARY ${CMAKE_BINARY_DIR}/bin/mod2c_core${CMAKE_EXECUTABLE_SUFFIX})
  set(CORENRN_MOD2CPP_INCLUDE ${CMAKE_BINARY_DIR}/include)
else()
  find_package(nmodl)
  if(NOT nmodl_FOUND)
    # An explicitly given CORENRN_NMODL_DIR that does not lead to NMODL is a hard error.
    if(NOT "${CORENRN_NMODL_DIR}" STREQUAL "")
      message(FATAL_ERROR "Cannot find NMODL in ${CORENRN_NMODL_DIR}")
    endif()
    # Fall back to the submodule; the python bindings option is set to OFF before including it.
    set(NMODL_ENABLE_PYTHON_BINDINGS
        OFF
        CACHE BOOL "Disable NMODL python bindings")
    include(AddNmodlSubmodule)
    set(CORENRN_MOD2CPP_BINARY ${CMAKE_BINARY_DIR}/bin/nmodl${CMAKE_EXECUTABLE_SUFFIX})
    set(CORENRN_MOD2CPP_INCLUDE ${CMAKE_BINARY_DIR}/include)
    set(ENV{PYTHONPATH} "$ENV{PYTHONPATH}")
    set(nmodl_PYTHONPATH "${CMAKE_BINARY_DIR}/lib")
    set(CORENRN_NMODL_PYTHONPATH "${nmodl_PYTHONPATH}:$ENV{PYTHONPATH}")
    set(NMODL_TARGET_TO_DEPEND nmodl)
  else()
    set(CORENRN_MOD2CPP_BINARY ${nmodl_BINARY})
    set(CORENRN_MOD2CPP_INCLUDE ${nmodl_INCLUDE})
    # Put the NMODL python interface first on PYTHONPATH and remember the resulting value.
    set(ENV{PYTHONPATH} "${nmodl_PYTHONPATH}:$ENV{PYTHONPATH}")
    set(CORENRN_NMODL_PYTHONPATH $ENV{PYTHONPATH})
  endif()
  include_directories(${CORENRN_MOD2CPP_INCLUDE})
  # User-supplied extra arguments for nmodl (cpu/gpu target), empty by default.
  set(CORENRN_NMODL_FLAGS
      ""
      CACHE STRING "Extra NMODL options such as passes")
endif()

# =============================================================================
# Profiler/Instrumentation Options
# =============================================================================
# Caliper: mandatory once requested; also records the library name to link against.
if(CORENRN_ENABLE_CALIPER_PROFILING)
  find_package(caliper REQUIRED)
  set(CORENRN_CALIPER_LIB caliper)
  list(APPEND CORENRN_COMPILE_DEFS CORENEURON_CALIPER)
endif()

# LIKWID: mandatory once requested.
if(CORENRN_ENABLE_LIKWID_PROFILING)
  find_package(likwid REQUIRED)
  # TODO: avoid this part, probably by using some likwid CMake target
  include_directories(${likwid_INCLUDE_DIRS})
  list(APPEND CORENRN_COMPILE_DEFS LIKWID_PERFMON)
endif()

# Debugging code with extra logs to stdout is enabled through these four definitions.
if(CORENRN_ENABLE_DEBUG_CODE)
  list(
    APPEND
    CORENRN_COMPILE_DEFS
    CORENRN_DEBUG
    CHKPNTDEBUG
    CORENRN_DEBUG_QUEUE
    INTERLEAVE_DEBUG)
endif()

# =============================================================================
# Common CXX flags : ignore unknown pragma warnings
# =============================================================================
# Skipped for wheel builds: with the nrnivmodl workflow it is not known which compiler will be
# invoked with these flags, so they have to stay as generic as possible. An undefined
# NRN_WHEEL_BUILD evaluates to false, i.e. the flags are added.
if(NOT NRN_WHEEL_BUILD)
  list(APPEND CORENRN_EXTRA_CXX_FLAGS "${IGNORE_UNKNOWN_PRAGMA_FLAGS}")
endif()

# Main source directory.
add_subdirectory(coreneuron)

# Extract the various compiler option strings to use inside nrnivmodl-core. Sets the global property
# CORENRN_LIB_LINK_FLAGS, which contains the arguments that must be added to the link line for
# `special` to link against `libcorenrnmech.{a,so}`
include(MakefileBuildOptions)

# The nrnivmodl-core script and makefile are generated here from the MakefileBuildOptions results.
add_subdirectory(extra)

# Tests are only configured on request.
if(CORENRN_ENABLE_UNIT_TESTS)
  add_subdirectory(tests)
endif()

# =============================================================================
# Install cmake modules
# =============================================================================
# Read the global property into a variable of the same name so that the @ONLY substitution in the
# package config template can reference it.
get_property(
  CORENRN_NEURON_LINK_FLAGS
  GLOBAL
  PROPERTY CORENRN_NEURON_LINK_FLAGS)
configure_file(CMake/coreneuron-config.cmake.in CMake/coreneuron-config.cmake @ONLY)

# Install the generated package config and the exported `coreneuron` target set side by side.
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/CMake/coreneuron-config.cmake" DESTINATION share/cmake)
install(EXPORT coreneuron DESTINATION share/cmake)

# Documentation targets are only created when CoreNEURON is not built as a subproject.
if(NOT CORENEURON_AS_SUBPROJECT)
  # =============================================================================
  # Setup Doxygen documentation
  # =============================================================================
  find_package(Doxygen QUIET)
  if(DOXYGEN_FOUND)
    # Generate the Doxyfile with the correct source paths in the build tree.
    configure_file(${PROJECT_SOURCE_DIR}/docs/Doxyfile.in ${PROJECT_BINARY_DIR}/Doxyfile)
    add_custom_target(
      doxygen
      COMMENT "Generating API documentation with Doxygen"
      WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
      COMMAND ${DOXYGEN_EXECUTABLE} ${PROJECT_BINARY_DIR}/Doxyfile
      VERBATIM)
  endif()

  # =============================================================================
  # Setup Sphinx documentation
  # =============================================================================
  find_package(Sphinx QUIET)
  if(SPHINX_FOUND)
    # HTML output is built from docs/ in the source tree into docs/ in the build tree.
    set(SPHINX_SOURCE ${PROJECT_SOURCE_DIR}/docs)
    set(SPHINX_BUILD ${PROJECT_BINARY_DIR}/docs/)

    add_custom_target(
      sphinx
      COMMENT "Generating documentation with Sphinx"
      WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
      COMMAND ${SPHINX_EXECUTABLE} -b html ${SPHINX_SOURCE} ${SPHINX_BUILD})
  endif()

  # =============================================================================
  # Build full docs
  # =============================================================================
  # `docs` always exists: it either chains doxygen and sphinx or prints a hint about what is missing.
  if(NOT (DOXYGEN_FOUND AND SPHINX_FOUND))
    add_custom_target(
      docs
      COMMAND echo "Please install docs requirements (see docs/README.md)!"
      COMMENT "Documentation generation not possible!"
      VERBATIM)
  else()
    add_custom_target(
      docs
      COMMAND ${CMAKE_COMMAND} --build ${PROJECT_BINARY_DIR} --target doxygen
      COMMAND ${CMAKE_COMMAND} --build ${PROJECT_BINARY_DIR} --target sphinx
      COMMENT "Generating full documentation")
  endif()
endif()
# =============================================================================
# Build status
# =============================================================================
# Print a human-readable summary of the configuration: how to build, then the value of each option.
message(STATUS "")
message(STATUS "Configured CoreNEURON ${PROJECT_VERSION}")
message(STATUS "")
message(STATUS "You can now build CoreNEURON using:")
message(STATUS "  cmake --build . --parallel 8 [--target TARGET]")
message(STATUS "You might want to adjust the number of parallel build jobs for your system.")
message(STATUS "Some non-default targets you might want to build:")
message(STATUS "--------------------+--------------------------------------------------------")
message(STATUS " Target             |   Description")
message(STATUS "--------------------+--------------------------------------------------------")
message(STATUS "install             | Will install CoreNEURON to: ${CMAKE_INSTALL_PREFIX}")
message(STATUS "docs                | Build full docs. Calls targets: doxygen, sphinx")
message(STATUS "--------------------+--------------------------------------------------------")
message(STATUS " Build option       | Status")
message(STATUS "--------------------+--------------------------------------------------------")
message(STATUS "CXX COMPILER        | ${CMAKE_CXX_COMPILER}")
message(STATUS "COMPILE FLAGS       | ${CORENRN_CXX_FLAGS}")
message(STATUS "Build Type          | ${COMPILE_LIBRARY_TYPE}")
message(STATUS "MPI                 | ${CORENRN_ENABLE_MPI}")
if(CORENRN_ENABLE_MPI)
  message(STATUS "  DYNAMIC           | ${CORENRN_ENABLE_MPI_DYNAMIC}")
  if(NOT CORENRN_ENABLE_MPI_DYNAMIC OR NOT NRN_MPI_LIBNAME_LIST)
    message(STATUS "  INC               | ${MPI_CXX_INCLUDE_PATH}")
  else()
    # ~~~
    # With dynamic MPI the list of libraries to build is supplied by NEURON, which
    # avoids duplicating cmake code on the CoreNEURON side. Print one
    # LIBNAME/INC pair per entry, walking both lists by index.
    # ~~~
    list(LENGTH NRN_MPI_LIBNAME_LIST mpi_lib_count)
    math(EXPR last_mpi_index "${mpi_lib_count} - 1")
    foreach(mpi_index RANGE ${last_mpi_index})
      list(GET NRN_MPI_LIBNAME_LIST ${mpi_index} mpi_libname)
      list(GET NRN_MPI_INCLUDE_LIST ${mpi_index} mpi_include)
      message(STATUS "    LIBNAME         | core${mpi_libname}")
      message(STATUS "    INC             | ${mpi_include}")
    endforeach()
  endif()
endif()
message(STATUS "OpenMP              | ${CORENRN_ENABLE_OPENMP}")
message(STATUS "Use legacy units    | ${CORENRN_ENABLE_LEGACY_UNITS}")
message(STATUS "NMODL               | ${CORENRN_ENABLE_NMODL}")
if(CORENRN_ENABLE_NMODL)
  message(STATUS "  FLAGS             | ${CORENRN_NMODL_FLAGS}")
endif()
message(STATUS "MOD2CPP PATH        | ${CORENRN_MOD2CPP_BINARY}")
message(STATUS "GPU Support         | ${CORENRN_ENABLE_GPU}")
if(CORENRN_ENABLE_GPU)
  message(STATUS "  CUDA              | ${CUDAToolkit_LIBRARY_DIR}")
  message(STATUS "  Offload           | ${CORENRN_ACCELERATOR_OFFLOAD}")
  message(STATUS "  Unified Memory    | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}")
endif()
message(STATUS "Auto Timeout        | ${CORENRN_ENABLE_TIMEOUT}")
message(STATUS "Wrap exp()          | ${CORENRN_ENABLE_HOC_EXP}")
message(STATUS "SplayTree Queue     | ${CORENRN_ENABLE_SPLAYTREE_QUEUING}")
message(STATUS "NetReceive Buffer   | ${CORENRN_ENABLE_NET_RECEIVE_BUFFER}")
message(STATUS "Caliper             | ${CORENRN_ENABLE_CALIPER_PROFILING}")
message(STATUS "Likwid              | ${CORENRN_ENABLE_LIKWID_PROFILING}")
message(STATUS "Unit Tests          | ${CORENRN_ENABLE_UNIT_TESTS}")
message(STATUS "Reporting           | ${CORENRN_ENABLE_REPORTING}")
if(CORENRN_ENABLE_REPORTING)
  message(STATUS "  sonatareport_INC  | ${sonatareport_INCLUDE_DIR}")
  message(STATUS "  sonatareport_LIB  | ${sonatareport_LIBRARY}")
  message(STATUS "  reportinglib_INC  | ${reportinglib_INCLUDE_DIR}")
  message(STATUS "  reportinglib_LIB  | ${reportinglib_LIBRARY}")
endif()
message(STATUS "--------------+--------------------------------------------------------------")
message(STATUS " See documentation : https://github.com/BlueBrain/CoreNeuron/")
message(STATUS "--------------+--------------------------------------------------------------")
message(STATUS "")


================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: README.md
================================================
 :bangbang:
 **NOTE:** CoreNEURON is now [integrated within the NEURON](https://github.com/neuronsimulator/nrn/tree/master/src/coreneuron) simulator at the source level, and hence all the latest development happens under the main GitHub project [neuronsimulator/nrn](https://github.com/neuronsimulator/nrn). To use CoreNEURON, see the latest NEURON documentation at [nrn.readthedocs.io](https://nrn.readthedocs.io/en/latest/). :bangbang:

_______________________________________________________

![CoreNEURON CI](https://github.com/BlueBrain/CoreNeuron/workflows/CoreNEURON%20CI/badge.svg) [![codecov](https://codecov.io/gh/BlueBrain/CoreNeuron/branch/master/graph/badge.svg?token=mguTdBx93p)](https://codecov.io/gh/BlueBrain/CoreNeuron)

![CoreNEURON](docs/_static/bluebrain_coreneuron.jpg)


## Citation

If you would like to know more about CoreNEURON or would like to cite it, then use the following paper:

* Pramod Kumbhar, Michael Hines, Jeremy Fouriaux, Aleksandr Ovcharenko, James King, Fabien Delalondre and Felix Schürmann. CoreNEURON : An Optimized Compute Engine for the NEURON Simulator ([doi.org/10.3389/fninf.2019.00063](https://doi.org/10.3389/fninf.2019.00063))

## License
* See LICENSE.txt
* See [NEURON](https://github.com/neuronsimulator/nrn)


## Funding

CoreNEURON is developed in a joint collaboration between the Blue Brain Project and Yale University. This work is supported by funding to the Blue Brain Project, a research center of the École polytechnique fédérale de Lausanne (EPFL), from the Swiss government’s ETH Board of the Swiss Federal Institutes of Technology, NIH grant number R01NS11613 (Yale University), the European Union Seventh Framework Program (FP7/2007-2013) under grant agreement n° 604102 (HBP) and the European Union’s Horizon 2020 Framework Programme for Research and Innovation under Specific Grant Agreement n° 720270 (Human Brain Project SGA1), n° 785907 (Human Brain Project SGA2) and n° 945539 (Human Brain Project SGA3).

Copyright (c) 2016 - 2022 Blue Brain Project/EPFL


================================================
FILE: coreneuron/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================

# Definitions, compile options and link options collected by the top-level CMakeLists.txt are
# applied at directory scope here, so that they cover all CoreNEURON targets without leaking into
# other included projects.
add_compile_definitions(${CORENRN_COMPILE_DEFS})
add_compile_options(${CORENRN_EXTRA_CXX_FLAGS})
add_link_options(${CORENRN_EXTRA_LINK_FLAGS})

# Runtime artefacts (e.g. dll) are placed in the bin directory of the build tree.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

# =============================================================================
# gather various source files
# =============================================================================
# Sources of the core library, collected relative to this directory. Explicit file names and
# wildcard patterns are mixed on purpose: only part of apps/, mpi/core/ and sim/scopmath/ is built.
file(
  GLOB
  CORENEURON_CODE_FILES
  "apps/main1.cpp"
  "apps/corenrn_parameters.cpp"
  "gpu/nrn_acc_manager.cpp"
  "io/*.cpp"
  "io/reports/*.cpp"
  "mechanism/*.cpp"
  "mpi/core/nrnmpi_def_cinc.cpp"
  "network/*.cpp"
  "permute/*.cpp"
  "sim/*.cpp"
  "sim/scopmath/abort.cpp"
  "sim/scopmath/newton_thread.cpp"
  "utils/*.cpp"
  "utils/*/*.c"
  "utils/*/*.cpp")

# Sources of the MPI library, which is built as a separate target.
set(MPI_LIB_FILES "mpi/lib/mpispike.cpp" "mpi/lib/nrnmpi.cpp")

if(CORENRN_ENABLE_MPI)
  # Building these requires -ldl, which is only added if MPI is enabled.
  list(APPEND CORENEURON_CODE_FILES "mpi/core/resolve.cpp" "mpi/core/nrnmpidec.cpp")
endif()

# Copy the Random123 headers into the build tree's include directory at configure time.
file(COPY "${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include/Random123"
     DESTINATION "${CMAKE_BINARY_DIR}/include")

# config.cpp is generated in the build tree and compiled together with the regular sources.
list(APPEND CORENEURON_CODE_FILES "${PROJECT_BINARY_DIR}/coreneuron/config/config.cpp")

set(ENGINEMECH_CODE_FILE "mechanism/mech/enginemech.cpp")

# For external mod files the modl_ref function in mod_func.c has to be generated; this is the
# script used for that.
set(MODFUNC_PERL_SCRIPT "mechanism/mech/mod_func.c.pl")

set(NMODL_UNITS_FILE "${CMAKE_BINARY_DIR}/share/mod2c/nrnunits.lib")

# =============================================================================
# Copy files that are required by nrnivmodl-core to the build tree at build time.
# =============================================================================
# mod_func.c.pl and enginemech.cpp go to {build_dir}/share/coreneuron/ under their own file names.
set(nrnivmodl_core_dependencies "")
foreach(support_file ${MODFUNC_PERL_SCRIPT} ${ENGINEMECH_CODE_FILE})
  get_filename_component(support_file_name "${support_file}" NAME)
  set(support_file_build_path "${CMAKE_BINARY_DIR}/share/coreneuron/${support_file_name}")
  cpp_cc_build_time_copy(
    INPUT "${CMAKE_CURRENT_SOURCE_DIR}/${support_file}"
    OUTPUT "${support_file_build_path}"
    NO_TARGET)
  list(APPEND nrnivmodl_core_dependencies "${support_file_build_path}")
endforeach()

# Builtin mod files go from {source}/coreneuron/mechanism/mech/modfile/*.mod to
# {build_dir}/share/modfile/, one copy rule per file.
file(GLOB builtin_modfiles
     "${CORENEURON_PROJECT_SOURCE_DIR}/coreneuron/mechanism/mech/modfile/*.mod")
foreach(modfile_source_path ${builtin_modfiles})
  # Keep the file name and change only the directory.
  get_filename_component(modfile_name "${modfile_source_path}" NAME)
  set(modfile_build_path "${CMAKE_BINARY_DIR}/share/modfile/${modfile_name}")
  cpp_cc_build_time_copy(
    INPUT "${modfile_source_path}"
    OUTPUT "${modfile_build_path}"
    NO_TARGET)
  list(APPEND nrnivmodl_core_dependencies "${modfile_build_path}")
endforeach()

# A target in the default build that depends on every copied file.
add_custom_target(coreneuron-copy-nrnivmodl-core-dependencies ALL
                  DEPENDS ${nrnivmodl_core_dependencies})

# Store the build-tree paths in a cache variable; these are an implicit dependency of
# nrnivmodl-core.
set(CORENEURON_BUILTIN_MODFILES
    "${nrnivmodl_core_dependencies}"
    CACHE STRING "List of builtin modfiles that nrnivmodl-core implicitly depends on" FORCE)

# =============================================================================
# coreneuron GPU library
# =============================================================================
if(CORENRN_ENABLE_GPU)
  # ~~~
  # artificial cells and some other cpp files (using Random123) should be compiled
  # without OpenACC to avoid use of GPU Random123 streams
  # OL210813: this shouldn't be needed anymore, but it may have a small performance benefit
  #
  # Every entry must name a real source file: a source file property set on a path that
  # is not part of any target has no effect, so a stale name is silently ignored. The
  # data-return file under io/ is core2nrn_data_return.cpp.
  # ~~~
  set(OPENACC_EXCLUDED_FILES
      ${CMAKE_CURRENT_BINARY_DIR}/netstim.cpp
      ${CMAKE_CURRENT_BINARY_DIR}/netstim_inhpoisson.cpp
      ${CMAKE_CURRENT_BINARY_DIR}/pattern.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/io/nrn_setup.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/io/setup_fornetcon.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/io/core2nrn_data_return.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/io/global_vars.cpp)

  set_source_files_properties(${OPENACC_EXCLUDED_FILES} PROPERTIES COMPILE_FLAGS
                                                                   "-DDISABLE_OPENACC")
  # Only compile the explicit CUDA implementation of the Hines solver in GPU builds. Because of
  # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
  # this cannot be included in the same shared library as the rest of the OpenACC code.
  set(CORENEURON_CUDA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/permute/cellorder.cu)

  # Eigen functions cannot be called directly from OpenACC regions, but Eigen is sort-of compatible
  # with being compiled as CUDA code. Because of
  # https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
  # this has to mean `nvc++ -cuda` rather than `nvcc`. We explicitly instantiate Eigen functions for
  # different matrix sizes in partial_piv_lu.cpp (with CUDA attributes but without OpenACC or OpenMP
  # annotations) and dispatch to these from a wrapper in partial_piv_lu.h that does have
  # OpenACC/OpenMP annotations.
  if(CORENRN_ENABLE_NMODL AND EXISTS ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
    list(APPEND CORENEURON_CODE_FILES ${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp)
    # CORENRN_ENABLE_GPU is already known to be true inside this block, so it is not tested again.
    if(CORENRN_HAVE_NVHPC_COMPILER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
      # In this case OpenAccHelper.cmake passes -gpu=debug, which makes these Eigen functions
      # extremely slow. Downgrade that to -gpu=lineinfo for this file.
      set_source_files_properties(${CORENRN_MOD2CPP_INCLUDE}/partial_piv_lu/partial_piv_lu.cpp
                                  PROPERTIES COMPILE_FLAGS "-gpu=lineinfo,nodebug -O1")
    endif()
  endif()
endif()

# =============================================================================
# create libraries
# =============================================================================

# Name shared by the coreneuron MPI object library and the dynamic MPI library.
set(CORENRN_MPI_LIB_NAME
    "corenrn_mpi"
    CACHE INTERNAL "")

# Without dynamic MPI the MPI sources are only compiled to object files, which are then handed
# over through CORENRN_MPI_OBJ.
if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
  add_library(${CORENRN_MPI_LIB_NAME} OBJECT ${MPI_LIB_FILES})
  set_target_properties(${CORENRN_MPI_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
  target_include_directories(
    ${CORENRN_MPI_LIB_NAME} PRIVATE ${MPI_INCLUDE_PATH} ${CORENEURON_PROJECT_SOURCE_DIR}
                                    ${CORENEURON_PROJECT_BINARY_DIR}/generated)
  target_link_libraries(${CORENRN_MPI_LIB_NAME} ${CORENRN_CALIPER_LIB})
  set(CORENRN_MPI_OBJ $<TARGET_OBJECTS:${CORENRN_MPI_LIB_NAME}>)
endif()

# coreneuron-core holds the bulk of the non-mechanism CoreNEURON code. It is always created and
# installed as a static library; the nrnivmodl-core workflow then extracts its object files and
# does one of the following:
#
# * shared build: creates libcorenrnmech.so from these objects plus those from the translated MOD
#   files
# * static build: creates a (temporary, does not get installed) libcorenrnmech.a from these objects
#   plus those from the translated MOD files, then statically links that into special-core
#   (nrniv-core)
#
# With this scheme core and mechanism .o files are linked in a single step, which is important for
# GPU linking. The price is that the core code is installed twice, once in libcoreneuron-core.a and
# once in libcorenrnmech.so (shared) or nrniv-core (static). In a GPU build,
# libcoreneuron-cuda.{a,so} is also linked to provide the CUDA implementation of the Hines solver.
# This cannot be included in coreneuron-core because of this issue:
# https://forums.developer.nvidia.com/t/cannot-dynamically-load-a-shared-library-containing-both-openacc-and-cuda-code/210972
add_library(coreneuron-core STATIC ${CORENEURON_CODE_FILES} ${CORENRN_MPI_OBJ})
if(CORENRN_ENABLE_GPU)
  add_library(coreneuron-cuda ${COMPILE_LIBRARY_TYPE} ${CORENEURON_CUDA_FILES})
  target_link_libraries(coreneuron-core PUBLIC coreneuron-cuda)
  # Remembered so that the loop below also covers the CUDA library; unset in non-GPU builds.
  set(coreneuron_cuda_target coreneuron-cuda)
endif()

# Both libraries use the project source tree and the generated/ directory as private include paths.
foreach(core_library coreneuron-core ${coreneuron_cuda_target})
  target_include_directories(${core_library} PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}
                                                     ${CORENEURON_PROJECT_BINARY_DIR}/generated)
endforeach()

# we can link to MPI libraries in non-dynamic-mpi build
if(CORENRN_ENABLE_MPI AND NOT CORENRN_ENABLE_MPI_DYNAMIC)
  target_link_libraries(coreneuron-core PUBLIC ${MPI_CXX_LIBRARIES})
endif()

# ~~~
# The main coreneuron library only strictly needs to be linked to libdl.so
# in a dynamic MPI build. However, on old systems such as CentOS 7 we saw
# that the MPICH library requires an explicit link to libdl.so, so it is
# always linked. See
#   https://github.com/neuronsimulator/nrn-build-ci/pull/51
# ~~~
target_link_libraries(coreneuron-core PUBLIC ${CMAKE_DL_LIBS})

# this is where we handle dynamic mpi library build
if(CORENRN_ENABLE_MPI AND CORENRN_ENABLE_MPI_DYNAMIC)
  # store mpi library targets that will be built
  list(APPEND corenrn_mpi_targets "")

  # ~~~
  # If CoreNEURON is built as a submodule of NEURON, check whether NEURON has created the
  # list of libraries that need to be built. We use NEURON's CMake variables here so that
  # we don't have to duplicate CMake code in CoreNEURON (we want to have a unified CMake
  # project soon). In the absence of NEURON, just build a single library, libcorenrn_mpi.
  # This is mostly used for testing.
  # ~~~
  if(NOT CORENEURON_AS_SUBPROJECT)
    add_library(${CORENRN_MPI_LIB_NAME} SHARED ${MPI_LIB_FILES})
    target_link_libraries(${CORENRN_MPI_LIB_NAME} ${MPI_CXX_LIBRARIES})
    target_include_directories(
      ${CORENRN_MPI_LIB_NAME} PRIVATE ${MPI_INCLUDE_PATH} ${CORENEURON_PROJECT_SOURCE_DIR}
                                      ${CORENEURON_PROJECT_BINARY_DIR}/generated)
    set_property(TARGET ${CORENRN_MPI_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON)
    list(APPEND corenrn_mpi_targets ${CORENRN_MPI_LIB_NAME})
  else()
    # ~~~
    # From NEURON we know how many different libraries need to be built, their names, and
    # the include paths to be used for building the shared libraries. Iterate through those
    # and build a separate library for each MPI distribution. For example, the following
    # libraries are created:
    # - libcorenrn_mpich.so
    # - libcorenrn_ompi.so
    # - libcorenrn_mpt.so
    # ~~~
    list(LENGTH NRN_MPI_LIBNAME_LIST _num_mpi)
    math(EXPR num_mpi "${_num_mpi} - 1")
    foreach(val RANGE ${num_mpi})
      list(GET NRN_MPI_INCLUDE_LIST ${val} include)
      list(GET NRN_MPI_LIBNAME_LIST ${val} libname)

      add_library(core${libname}_lib SHARED ${MPI_LIB_FILES})
      target_link_libraries(core${libname}_lib ${CORENRN_CALIPER_LIB})
      target_include_directories(
        core${libname}_lib
        PUBLIC ${include}
        PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR} ${CORENEURON_PROJECT_BINARY_DIR}/generated)

      # ~~~
      # TODO: somehow mingw requires explicit linking. This needs to be verified
      # when we will test coreneuron on windows.
      # ~~~
      if(MINGW) # type msmpi only
        add_dependencies(core${libname}_lib coreneuron-core)
        target_link_libraries(core${libname}_lib ${MPI_C_LIBRARIES} coreneuron-core)
      endif()
      set_property(TARGET core${libname}_lib PROPERTY OUTPUT_NAME core${libname})
      list(APPEND corenrn_mpi_targets "core${libname}_lib")
    endforeach(val)
  endif()

  set_target_properties(
    ${corenrn_mpi_targets}
    PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
               LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
               POSITION_INDEPENDENT_CODE ON)
  install(TARGETS ${corenrn_mpi_targets} DESTINATION lib)
endif()

# Suppress some compiler warnings.
target_compile_options(coreneuron-core PRIVATE ${CORENEURON_CXX_WARNING_SUPPRESSIONS})
target_link_libraries(coreneuron-core PUBLIC ${reportinglib_LIBRARY} ${sonatareport_LIBRARY}
                                             ${CORENRN_CALIPER_LIB} ${likwid_LIBRARIES})

# TODO: fix adding a dependency of coreneuron-core on CLI11::CLI11 when CLI11 is a submodule. Right
# now this doesn't work because the CLI11 targets are not exported/installed but coreneuron-core is.
get_target_property(CLI11_HEADER_DIRECTORY CLI11::CLI11 INTERFACE_INCLUDE_DIRECTORIES)
target_include_directories(
  coreneuron-core SYSTEM PRIVATE ${CLI11_HEADER_DIRECTORY}
                                 ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)

# See: https://en.cppreference.com/w/cpp/filesystem#Notes
if(CMAKE_CXX_COMPILER_IS_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1)
  target_link_libraries(coreneuron-core PUBLIC stdc++fs)
endif()

if(CORENRN_ENABLE_GPU)
  # nrnran123.cpp uses Boost.Pool in GPU builds if it's available.
  find_package(Boost QUIET)
  if(Boost_FOUND)
    message(STATUS "Boost found, enabling use of memory pools for Random123...")
    target_include_directories(coreneuron-core SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
    target_compile_definitions(coreneuron-core PRIVATE CORENEURON_USE_BOOST_POOL)
  endif()
endif()

# Place the core (and, in GPU builds, CUDA) library under <build>/lib. Position-independent code is
# only requested when CORENRN_ENABLE_SHARED is on.
set_target_properties(
  coreneuron-core ${coreneuron_cuda_target}
  PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
             LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
             POSITION_INDEPENDENT_CODE ${CORENRN_ENABLE_SHARED})
# cpp_cc_configure_sanitizers is defined outside this file section; presumably it applies the
# configured sanitizer flags to the listed targets (TODO confirm). corenrn_mpi_targets is only
# populated in dynamic MPI builds and is empty otherwise.
cpp_cc_configure_sanitizers(TARGET coreneuron-core ${coreneuron_cuda_target} ${corenrn_mpi_targets})

# =============================================================================
# create special-core with halfgap.mod for tests
# =============================================================================
set(modfile_directory "${CORENEURON_PROJECT_SOURCE_DIR}/tests/integration/ring_gap/mod files")
file(GLOB modfiles "${modfile_directory}/*.mod")

# We have to link things like unit tests against this because some "core" .cpp files refer to
# symbols in the translated versions of default .mod files
set(nrniv_core_prefix "${CMAKE_BINARY_DIR}/bin/${CMAKE_SYSTEM_PROCESSOR}")
set(corenrn_mech_library
    "${nrniv_core_prefix}/${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_PREFIX}corenrnmech${CMAKE_${COMPILE_LIBRARY_TYPE}_LIBRARY_SUFFIX}"
)
set(output_binaries "${nrniv_core_prefix}/special-core" "${corenrn_mech_library}")

add_custom_command(
  OUTPUT ${output_binaries}
  DEPENDS coreneuron-core ${NMODL_TARGET_TO_DEPEND} ${modfiles} ${CORENEURON_BUILTIN_MODFILES}
  COMMAND ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core -b ${COMPILE_LIBRARY_TYPE} -m
          ${CORENRN_MOD2CPP_BINARY} -p 4 "${modfile_directory}"
  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bin
  COMMENT "Running nrnivmodl-core with halfgap.mod")
add_custom_target(nrniv-core ALL DEPENDS ${output_binaries})

# Build an imported target representing the corenrnmech library (${corenrn_mech_library}) that
# nrnivmodl-core produces under bin/${CMAKE_SYSTEM_PROCESSOR} (e.g. bin/x86_64), which executables
# such as the unit tests must link against
add_library(builtin-libcorenrnmech SHARED IMPORTED)
add_dependencies(builtin-libcorenrnmech nrniv-core)
set_target_properties(builtin-libcorenrnmech PROPERTIES IMPORTED_LOCATION "${corenrn_mech_library}")

if(CORENRN_ENABLE_GPU)
  separate_arguments(CORENRN_ACC_FLAGS UNIX_COMMAND "${NVHPC_ACC_COMP_FLAGS}")
  target_compile_options(coreneuron-core PRIVATE ${CORENRN_ACC_FLAGS})
endif()

# Create an extra target for use by NEURON when CoreNEURON is being built as a submodule. NEURON
# tests will depend on this, so it must in turn depend on everything that is needed to run
# `nrnivmodl -coreneuron`.
add_custom_target(coreneuron-for-tests)
add_dependencies(coreneuron-for-tests coreneuron-core ${NMODL_TARGET_TO_DEPEND})
# Create an extra target for internal use that unit tests and so on can depend on.
# ${corenrn_mech_library} is libcorenrnmech.{a,so}, which contains both the compiled default
# mechanisms and the content of libcoreneuron-core.a.
add_library(coreneuron-all INTERFACE)
target_link_libraries(coreneuron-all INTERFACE builtin-libcorenrnmech)
# Also copy the dependencies of libcoreneuron-core as interface dependencies of this new target
# (example: ${corenrn_mech_library} will probably depend on MPI, so when the unit tests link against
# ${corenrn_mech_library} they need to know to link against MPI too).
get_target_property(coreneuron_core_deps coreneuron-core LINK_LIBRARIES)
if(coreneuron_core_deps)
  foreach(dep ${coreneuron_core_deps})
    target_link_libraries(coreneuron-all INTERFACE ${dep})
  endforeach()
endif()

# Make headers available in the build tree under <build>/include/coreneuron, mirroring the layout
# that is later installed.

# engine.h is generated from its template; @ONLY restricts substitution to @VAR@ references.
configure_file(engine.h.in ${CMAKE_BINARY_DIR}/include/coreneuron/engine.h @ONLY)

# Collect every header below this directory, as paths relative to it, so that the relative path
# can be reused as the destination sub-path below.
file(
  GLOB_RECURSE main_headers
  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
  *.h *.hpp)

# The generated version header lives in the binary tree, so it is copied separately.
configure_file("${CORENEURON_PROJECT_BINARY_DIR}/generated/coreneuron/config/neuron_version.hpp"
               "${CMAKE_BINARY_DIR}/include/coreneuron/config/neuron_version.hpp" COPYONLY)
foreach(header ${main_headers})
  configure_file("${header}" "${CMAKE_BINARY_DIR}/include/coreneuron/${header}" COPYONLY)
endforeach()

# profiler_interface.h is additionally exposed under the nrniv/ sub-directory.
configure_file("utils/profile/profiler_interface.h"
               ${CMAKE_BINARY_DIR}/include/coreneuron/nrniv/profiler_interface.h COPYONLY)

# main program required for building special-core
file(COPY apps/coreneuron.cpp DESTINATION ${CMAKE_BINARY_DIR}/share/coreneuron)

# =============================================================================
# Install main targets
# =============================================================================

# coreneuron main libraries
install(
  TARGETS coreneuron-core ${coreneuron_cuda_target}
  EXPORT coreneuron
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib
  INCLUDES
  DESTINATION $<INSTALL_INTERFACE:include>)

# headers and some standalone code files for nrnivmodl-core
install(
  DIRECTORY ${CMAKE_BINARY_DIR}/include/coreneuron
  DESTINATION include/
  FILES_MATCHING
  PATTERN "*.h*"
  PATTERN "*.ipp")
install(FILES ${MODFUNC_PERL_SCRIPT} ${ENGINEMECH_CODE_FILE} DESTINATION share/coreneuron)

# copy mod2c/nmodl for nrnivmodl-core
install(PROGRAMS ${CORENRN_MOD2CPP_BINARY} DESTINATION bin)

if(NOT CORENRN_ENABLE_NMODL)
  install(FILES ${NMODL_UNITS_FILE} DESTINATION share/mod2c)
endif()

# install nrniv-core app
install(
  PROGRAMS ${CMAKE_BINARY_DIR}/bin/${CMAKE_HOST_SYSTEM_PROCESSOR}/special-core
  DESTINATION bin
  RENAME nrniv-core)
install(FILES apps/coreneuron.cpp DESTINATION share/coreneuron)

# install mechanism library in shared library builds, if we're linking statically then there is no
# need
if(CORENRN_ENABLE_SHARED)
  install(FILES ${corenrn_mech_library} DESTINATION lib)
endif()

# install random123 and nmodl headers
install(DIRECTORY ${CMAKE_BINARY_DIR}/include/ DESTINATION include)

# install mod files
install(DIRECTORY ${CMAKE_BINARY_DIR}/share/modfile DESTINATION share)


================================================
FILE: coreneuron/apps/coreneuron.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <coreneuron/engine.h>
#include "coreneuron/utils/profile/profiler_interface.h"

/// Entry point of the standalone executable: runs solve_core() on the command
/// line with the profiler initialised before and finalised after the run, and
/// returns the value produced by solve_core() as the process exit status.
int main(int argc, char** argv) {
    coreneuron::Instrumentor::init_profile();

    const auto exit_status = solve_core(argc, argv);

    coreneuron::Instrumentor::finalize_profile();
    return exit_status;
}


================================================
FILE: coreneuron/apps/corenrn_parameters.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/apps/corenrn_parameters.hpp"

#include <CLI/CLI.hpp>

namespace coreneuron {

extern std::string cnrn_version();

/**
 * Create the CLI11 application and register every CoreNEURON command-line
 * option on it.
 *
 * Each option/flag is bound to the member of the same purpose inherited from
 * corenrn_parameters_data (passed as `this->member`), so a later call to
 * parse() writes the parsed values straight into this object. Options are
 * registered either directly on the app or on one of the option groups
 * "GPU", "input", "parallel", "spike", "config" and "output".
 *
 * NOTE(review): several add_option() calls pass a trailing bool; presumably
 * this is CLI11's `defaulted` argument (show the bound variable's current
 * value as the default in the help text) -- confirm against the CLI11 version
 * in use.
 */
corenrn_parameters::corenrn_parameters()
    : m_app{std::make_unique<CLI::App>("CoreNeuron - Optimised Simulator Engine for NEURON.")} {
    auto& app = *m_app;
    // Reading parameters from / writing parameters to a configuration file.
    app.set_config("--read-config", "", "Read parameters from ini file", false)
        ->check(CLI::ExistingFile);
    app.add_option("--write-config",
                   this->writeParametersFilepath,
                   "Write parameters to this file",
                   false);

    // General (top-level) options.
    app.add_flag(
        "--mpi",
        this->mpi_enable,
        "Enable MPI. In order to initialize MPI environment this argument must be specified.");
    app.add_option("--mpi-lib",
                   this->mpi_lib,
                   "CoreNEURON MPI library to load for dynamic MPI support",
                   false);
    app.add_flag("--gpu", this->gpu, "Activate GPU computation.");
    app.add_option("--dt",
                   this->dt,
                   "Fixed time step. The default value is set by defaults.dat or is 0.025.",
                   true)
        ->check(CLI::Range(-1'000., 1e9));
    app.add_option("-e, --tstop", this->tstop, "Stop Time in ms.")->check(CLI::Range(0., 1e9));
    // --show is registered here and then retired at the end of this constructor.
    app.add_flag("--show");
    app.add_set(
        "--verbose",
        this->verbose,
        {verbose_level::NONE, verbose_level::ERROR, verbose_level::INFO, verbose_level::DEBUG_INFO},
        "Verbose level: 0 = NONE, 1 = ERROR, 2 = INFO, 3 = DEBUG. Default is INFO");
    app.add_flag("--model-stats",
                 this->model_stats,
                 "Print number of instances of each mechanism and detailed memory stats.");

    // GPU execution options.
    auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU.");
    sub_gpu
        ->add_option("-W, --nwarp",
                     this->nwarp,
                     "Number of warps to execute in parallel the Hines solver. Each warp solves a "
                     "group of cells. (Only used with cell permute 2)",
                     true)
        ->check(CLI::Range(0, 1'000'000));
    sub_gpu
        ->add_option("-R, --cell-permute",
                     this->cell_interleave_permute,
                     "Cell permutation: 0 No permutation; 1 optimise node adjacency; 2 optimize "
                     "parent adjacency.",
                     true)
        ->check(CLI::Range(0, 2));
    sub_gpu->add_flag("--cuda-interface",
                      this->cuda_interface,
                      "Activate CUDA branch of the code.");
    sub_gpu->add_option("-n, --num-gpus", this->num_gpus, "Number of gpus to use per node.");

    // Input dataset options; paths are validated to exist at parse time.
    auto sub_input = app.add_option_group("input", "Input dataset options.");
    sub_input->add_option("-d, --datpath", this->datpath, "Path containing CoreNeuron data files.")
        ->check(CLI::ExistingDirectory);
    sub_input->add_option("-f, --filesdat", this->filesdat, "Name for the distribution file.", true)
        ->check(CLI::ExistingFile);
    sub_input
        ->add_option("-p, --pattern",
                     this->patternstim,
                     "Apply patternstim using the specified spike file.")
        ->check(CLI::ExistingFile);
    sub_input
        ->add_option("-s, --seed", this->seed, "Initialization seed for random number generator.")
        ->check(CLI::Range(0, 100'000'000));
    // NOTE(review): the short name -v is also given to --version at the end of
    // this constructor; confirm how CLI11 resolves -v between the two.
    sub_input
        ->add_option("-v, --voltage",
                     this->voltage,
                     "Initial voltage used for nrn_finitialize(1, v_init). If 1000, then "
                     "nrn_finitialize(0,...).")
        ->check(CLI::Range(-1e9, 1e9));
    sub_input->add_option("--report-conf", this->reportfilepath, "Reports configuration file.")
        ->check(CLI::ExistingFile);
    sub_input
        ->add_option("--restore",
                     this->restorepath,
                     "Restore simulation from provided checkpoint directory.")
        ->check(CLI::ExistingDirectory);

    // Threading / MPI finalisation options.
    auto sub_parallel = app.add_option_group("parallel", "Parallel processing options.");
    sub_parallel->add_flag("-c, --threading",
                           this->threading,
                           "Parallel threads. The default is serial threads.");
    sub_parallel->add_flag("--skip-mpi-finalize",
                           this->skip_mpi_finalize,
                           "Do not call mpi finalize.");

    // Spike exchange options.
    auto sub_spike = app.add_option_group("spike", "Spike exchange options.");
    sub_spike
        ->add_option("--ms-phases", this->ms_phases, "Number of multisend phases, 1 or 2.", true)
        ->check(CLI::Range(1, 2));
    sub_spike
        ->add_option("--ms-subintervals",
                     this->ms_subint,
                     "Number of multisend subintervals, 1 or 2.",
                     true)
        ->check(CLI::Range(1, 2));
    sub_spike->add_flag("--multisend",
                        this->multisend,
                        "Use Multisend spike exchange instead of Allgather.");
    sub_spike
        ->add_option("--spkcompress",
                     this->spkcompress,
                     "Spike compression. Up to ARG are exchanged during MPI_Allgather.",
                     true)
        ->check(CLI::Range(0, 100'000));
    sub_spike->add_flag("--binqueue", this->binqueue, "Use bin queue.");

    // Miscellaneous simulation configuration.
    auto sub_config = app.add_option_group("config", "Config options.");
    sub_config->add_option("-b, --spikebuf", this->spikebuf, "Spike buffer size.", true)
        ->check(CLI::Range(0, 2'000'000'000));
    sub_config
        ->add_option("-g, --prcellgid",
                     this->prcellgid,
                     "Output prcellstate information for the gid NUMBER.")
        ->check(CLI::Range(-1, 2'000'000'000));
    sub_config->add_option("-k, --forwardskip", this->forwardskip, "Forwardskip to TIME")
        ->check(CLI::Range(0., 1e9));
    sub_config
        ->add_option(
            "-l, --celsius",
            this->celsius,
            "Temperature in degC. The default value is set in defaults.dat or else is 34.0.",
            true)
        ->check(CLI::Range(-1000., 1000.));
    sub_config
        ->add_option("--mindelay",
                     this->mindelay,
                     "Maximum integration interval (likely reduced by minimum NetCon delay).",
                     true)
        ->check(CLI::Range(0., 1e9));
    sub_config
        ->add_option("--report-buffer-size",
                     this->report_buff_size,
                     "Size in MB of the report buffer.")
        ->check(CLI::Range(1, 128));

    // Output options.
    auto sub_output = app.add_option_group("output", "Output configuration.");
    sub_output->add_option("-i, --dt_io", this->dt_io, "Dt of I/O.", true)
        ->check(CLI::Range(-1000., 1e9));
    sub_output->add_option("-o, --outpath",
                           this->outpath,
                           "Path to place output data files.",
                           true);
    sub_output->add_option("--checkpoint",
                           this->checkpointpath,
                           "Enable checkpoint and specify directory to store related files.");

    app.add_flag("-v, --version", this->show_version, "Show version information and quit.");

    // Mark the --show flag registered above as retired.
    CLI::retire_option(app, "--show");
}

// Defined out-of-line in this translation unit because the std::unique_ptr<CLI::App>
// member can only be destroyed where CLI::App is a complete type, i.e. where
// <CLI/CLI.hpp> has been included.
corenrn_parameters::~corenrn_parameters() = default;

std::string corenrn_parameters::config_to_str(bool default_also, bool write_description) const {
    return m_app->config_to_str(default_also, write_description);
}

void corenrn_parameters::reset() {
    static_cast<corenrn_parameters_data&>(*this) = corenrn_parameters_data{};
    m_app->clear();
}

/**
 * Parse the command line with the CLI11 app, storing the results in the
 * members this object inherits from corenrn_parameters_data.
 *
 * @param argc Number of entries in @p argv.
 * @param argv Argument vector; argv[0] is the program name.
 *
 * @throws CLI::ExtrasError if unrecognised extra arguments were given. The
 *         CLI11 message is printed first and the original exception is then
 *         rethrown.
 *
 * This function terminates the process instead of returning when:
 *  - CLI11 reports any other CLI::ParseError. This includes `--help`. The
 *    process exits with the status computed by CLI::App::exit(), i.e. 0 for
 *    help requests and CLI11's error code for genuine parse errors.
 *  - `--gpu` was requested but GPU support was not compiled in (status 42).
 *  - `--version` was requested (status 0, after printing the version).
 */
void corenrn_parameters::parse(int argc, char** argv) {
    try {
        m_app->parse(argc, argv);
        if (verbose == verbose_level::NONE) {
            nrn_nobanner_ = 1;
        }
    } catch (const CLI::ExtrasError& e) {
        // in case of parsing errors, show message with exception
        std::cerr << "CLI parsing error, see nrniv-core --help for more information. \n"
                  << std::endl;
        m_app->exit(e);
        // Rethrow the exception object that is in flight rather than a copy of it.
        throw;
    } catch (const CLI::ParseError& e) {
        // --help is also reported as a ParseError. App::exit() prints the help
        // or error text and returns the matching exit status (0 for help), so
        // propagate that status instead of always reporting success.
        exit(m_app->exit(e));
    }

#ifndef CORENEURON_ENABLE_GPU
    if (gpu) {
        std::cerr
            << "Error: GPU support was not enabled at build time but GPU execution was requested."
            << std::endl;
        exit(42);
    }
#endif

    // if user has asked for version info, print it and exit
    if (show_version) {
        std::cout << "CoreNEURON Version : " << cnrn_version() << std::endl;
        exit(0);
    }
}

std::ostream& operator<<(std::ostream& os, const corenrn_parameters& corenrn_param) {
    os << "GENERAL PARAMETERS" << std::endl
       << "--mpi=" << (corenrn_param.mpi_enable ? "true" : "false") << std::endl
       << "--mpi-lib=" << corenrn_param.mpi_lib << std::endl
       << "--gpu=" << (corenrn_param.gpu ? "true" : "false") << std::endl
       << "--dt=" << corenrn_param.dt << std::endl
       << "--tstop=" << corenrn_param.tstop << std::endl
       << std::endl
       << "GPU" << std::endl
       << "--nwarp=" << corenrn_param.nwarp << std::endl
       << "--cell-permute=" << corenrn_param.cell_interleave_permute << std::endl
       << "--cuda-interface=" << (corenrn_param.cuda_interface ? "true" : "false") << std::endl
       << std::endl
       << "INPUT PARAMETERS" << std::endl
       << "--voltage=" << corenrn_param.voltage << std::endl
       << "--seed=" << corenrn_param.seed << std::endl
       << "--datpath=" << corenrn_param.datpath << std::endl
       << "--filesdat=" << corenrn_param.filesdat << std::endl
       << "--pattern=" << corenrn_param.patternstim << std::endl
       << "--report-conf=" << corenrn_param.reportfilepath << std::endl
       << std::left << std::setw(15) << "--restore=" << corenrn_param.restorepath << std::endl
       << std::endl
       << "PARALLEL COMPUTATION PARAMETERS" << std::endl
       << "--threading=" << (corenrn_param.threading ? "true" : "false") << std::endl
       << "--skip_mpi_finalize=" << (corenrn_param.skip_mpi_finalize ? "true" : "false")
       << std::endl
       << std::endl
       << "SPIKE EXCHANGE" << std::endl
       << "--ms_phases=" << corenrn_param.ms_phases << std::endl
       << "--ms_subintervals=" << corenrn_param.ms_subint << std::endl
       << "--multisend=" << (corenrn_param.multisend ? "true" : "false") << std::endl
       << "--spk_compress=" << corenrn_param.spkcompress << std::endl
       << "--binqueue=" << (corenrn_param.binqueue ? "true" : "false") << std::endl
       << std::endl
       << "CONFIGURATION" << std::endl
       << "--spikebuf=" << corenrn_param.spikebuf << std::endl
       << "--prcellgid=" << corenrn_param.prcellgid << std::endl
       << "--forwardskip=" << corenrn_param.forwardskip << std::endl
       << "--celsius=" << corenrn_param.celsius << std::endl
       << "--mindelay=" << corenrn_param.mindelay << std::endl
       << "--report-buffer-size=" << corenrn_param.report_buff_size << std::endl
       << std::endl
       << "OUTPUT PARAMETERS" << std::endl
       << "--dt_io=" << corenrn_param.dt_io << std::endl
       << "--outpath=" << corenrn_param.outpath << std::endl
       << "--checkpoint=" << corenrn_param.checkpointpath << std::endl;

    return os;
}

corenrn_parameters corenrn_param;
int nrn_nobanner_{0};

}  // namespace coreneuron


================================================
FILE: coreneuron/apps/corenrn_parameters.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once
#include <cstdint>
#include <memory>
#include <ostream>
#include <string>

/**
 * \class corenrn_parameters
 * \brief Parses and contains Command Line parameters for Core Neuron
 *
 * This structure contains all the parameters that CoreNeuron fetches
 * from the Command Line. It uses the CLI11 libraries to parse these parameters
 * and saves them in an internal public structure. Each parameter can be
 * accessed or written freely. By default the constructor instantiates a
 * CLI11 object and initializes it for CoreNeuron use.
 * This object is freely accessible from any point of the program.
 * An ostream method is also provided to print out all the parameters that
 * CLI11 parse.
 * Please keep in mind that, due to the nature of the subcommands in CLI11,
 * the name of a subcategory NEEDS to come before the parameters that belong
 * to it, e.g. --mpi --gpu gpu --nwarp
 * Also, single-dash long options are no longer supported (-mpi -> --mpi).
 */

// Forward declaration so that this header does not need to include the CLI11
// headers. The class-key matches CLI11's own definition (`class App`), which
// avoids mismatched-tag warnings and, on ABIs that encode the class-key in
// mangled names, link errors.
namespace CLI {
class App;
}

namespace coreneuron {

/**
 * \brief Plain data holder for every CoreNEURON command-line parameter.
 *
 * Every member has an in-class default, so a default-constructed or
 * value-initialised object is fully initialised.
 */
struct corenrn_parameters_data {
    /// Verbosity levels that can be stored in \ref verbose.
    enum verbose_level : std::uint32_t {
        NONE = 0,
        ERROR = 1,
        INFO = 2,
        DEBUG_INFO = 3,
        DEFAULT = INFO
    };

    static constexpr int report_buff_size_default = 4;  ///< Default of report_buff_size (MB).

    unsigned spikebuf = 100'000;           ///< Internal buffer used on every rank for spikes
    int prcellgid = -1;                    ///< Gid of cell for prcellstate
    unsigned ms_phases = 2;                ///< Number of multisend phases, 1 or 2
    unsigned ms_subint = 2;                ///< Number of multisend interval. 1 or 2
    unsigned spkcompress = 0;              ///< Spike Compression
    unsigned cell_interleave_permute = 0;  ///< Cell interleaving permutation
    unsigned nwarp = 65536;  ///< Number of warps to balance for cell_interleave_permute == 2
    unsigned num_gpus = 0;   ///< Number of gpus to use per node
    unsigned report_buff_size = report_buff_size_default;  ///< Size in MB of the report buffer.
    int seed = -1;  ///< Initialization seed for random number generator (int)

    bool mpi_enable = false;         ///< Enable MPI flag.
    bool skip_mpi_finalize = false;  ///< Skip MPI finalization
    bool multisend = false;          ///< Use Multisend spike exchange instead of Allgather.
    bool threading = false;          ///< Enable pthread/openmp
    bool gpu = false;                ///< Enable GPU computation.
    bool cuda_interface = false;     ///< Enable CUDA interface (default is the OpenACC interface).
                                     ///< Branch of the code is executed through CUDA kernels
                                     ///< instead of OpenACC regions.
    bool binqueue = false;           ///< Use bin queue.

    bool show_version = false;  ///< Print version and exit.

    bool model_stats = false;  ///< Print mechanism counts and model size after initialization

    verbose_level verbose{verbose_level::DEFAULT};  ///< Verbosity-level

    double tstop = 100;        ///< Stop time of simulation in msec
    double dt = -1000.0;       ///< Timestep to use in msec
    double dt_io = 0.1;        ///< I/O timestep to use in msec
    double dt_report = 0.;     ///< I/O timestep to use in msec for reports. Initialised so that
                               ///< default-constructed objects never hold an indeterminate value.
    double celsius = -1000.0;  ///< Temperature in degC.
    double voltage = -65.0;    ///< Initial voltage used for nrn_finitialize(1, v_init).
    double forwardskip = 0.;   ///< Forward skip to TIME.
    double mindelay = 10.;     ///< Maximum integration interval (likely reduced by minimum NetCon
                               ///< delay).

    std::string patternstim;             ///< Apply patternstim using the specified spike file.
    std::string datpath = ".";           ///< Directory path where .dat files
    std::string outpath = ".";           ///< Directory where spikes will be written
    std::string filesdat = "files.dat";  ///< Name of file containing list of gids dat files read in
    std::string restorepath;             ///< Restore simulation from provided checkpoint directory.
    std::string reportfilepath;          ///< Reports configuration file.
    std::string checkpointpath;  ///< Enable checkpoint and specify directory to store related files.
    std::string writeParametersFilepath;  ///< Write parameters to this file
    std::string mpi_lib;                  ///< Name of CoreNEURON MPI library to load dynamically.
};

/**
 * @brief Parameter values plus the CLI11 app that fills them in.
 *
 * The parameter values themselves are the public members inherited from
 * corenrn_parameters_data; this type adds the command-line parser.
 */
struct corenrn_parameters: corenrn_parameters_data {
    corenrn_parameters();   ///< Constructor that initializes the CLI11 app.
    ~corenrn_parameters();  ///< Destructor defined in .cpp where CLI11 types are complete.

    /** @brief Parse the given command line with the CLI11 app.
     *
     *  The parsed values are stored in the members inherited from
     *  corenrn_parameters_data. Depending on the arguments this may terminate
     *  the process or throw instead of returning; see the definition in the
     *  .cpp file for the exact conditions.
     *
     *  @param argc Number of entries in @p argv.
     *  @param argv Argument vector, including the program name in argv[0].
     */
    void parse(int argc, char* argv[]);

    /** @brief Reset all parameters to their default values.
     *
     *  Unfortunately it is awkward to support `x = corenrn_parameters{}`
     *  because `m_app` holds pointers to members of `corenrn_parameters`.
     */
    void reset();

    /** @brief Whether the verbosity level is verbose_level::NONE.
     *
     *  Const-qualified so that it can also be called through a const reference.
     */
    bool is_quiet() const {
        return verbose == verbose_level::NONE;
    }

    /** @brief Return a string summarising the current parameter values.
     *
     * This forwards to the CLI11 method of the same name. Returns a string that
     * could be read in as a config of the current values of the App.
     *
     * @param default_also Include any defaulted arguments.
     * @param write_description Include option descriptions and the App description.
     */
    std::string config_to_str(bool default_also = false, bool write_description = false) const;

  private:
    // CLI app that performs CLI parsing. std::unique_ptr avoids having to
    // include CLI11 headers from CoreNEURON headers, and therefore avoids
    // CoreNEURON having to install CLI11 when using it from a submodule.
    std::unique_ptr<CLI::App> m_app;
};

std::ostream& operator<<(std::ostream& os,
                         const corenrn_parameters& corenrn_param);  /// Printing method.

extern corenrn_parameters corenrn_param;  /// Declaring global corenrn_parameters object for this
                                          /// instance of CoreNeuron.
extern int nrn_nobanner_;                 /// Global no banner setting

}  // namespace coreneuron


================================================
FILE: coreneuron/apps/main1.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/**
 * @file main1.cpp
 * @date 26 Oct 2014
 * @brief File containing main driver routine for CoreNeuron
 */

#include <cstring>
#include <climits>
#include <dlfcn.h>
#include <memory>
#include <vector>

#include "coreneuron/config/config.h"
#include "coreneuron/utils/randoms/nrnran123.h"
#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/mechanism/register_mech.hpp"
#include "coreneuron/io/output_spikes.hpp"
#include "coreneuron/io/nrn_checkpoint.hpp"
#include "coreneuron/utils/memory_utils.h"
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/io/prcellstate.hpp"
#include "coreneuron/utils/nrn_stats.h"
#include "coreneuron/io/reports/nrnreport.hpp"
#include "coreneuron/io/reports/binary_report_handler.hpp"
#include "coreneuron/io/reports/report_handler.hpp"
#include "coreneuron/io/reports/sonata_report_handler.hpp"
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/network/partrans.hpp"
#include "coreneuron/network/multisend.hpp"
#include "coreneuron/io/nrn_setup.hpp"
#include "coreneuron/io/file_utils.hpp"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/io/core2nrn_data_return.hpp"
#include "coreneuron/utils/utils.hpp"

extern "C" {
/// Return coreneuron::bbcore_write_version as a C string.
const char* corenrn_version() {
    const char* version = coreneuron::bbcore_write_version;
    return version;
}

// Reports the value of the CORENEURON_USE_LEGACY_UNITS macro, which is
// determined by CORENRN_ENABLE_LEGACY_UNITS.
bool corenrn_units_use_legacy() {
    const bool legacy_units = CORENEURON_USE_LEGACY_UNITS;
    return legacy_units;
}

void (*nrn2core_part2_clean_)();

/**
 * Apply @p nthread as the OpenMP thread count, unless the user already chose
 * one through the OMP_NUM_THREADS environment variable.
 *
 * Without "export OMP_NUM_THREADS=n", OpenMP defaults to one thread per core
 * of the node. When several MPI processes share that node this oversubscribes
 * the cores and things can become very slow, so in that case the NEURON user's
 * pc.nthread() is assumed to be a well chosen value.
 *
 * Does nothing when compiled without OpenMP.
 */
void set_openmp_threads(int nthread) {
#ifdef _OPENMP
    const bool user_chose_thread_count = getenv("OMP_NUM_THREADS") != nullptr;
    if (user_chose_thread_count) {
        return;
    }
    omp_set_num_threads(nthread);
#endif
}

/**
 * Convert char* containing arguments from neuron to char* argv[] for
 * coreneuron command line argument parser.
 *
 * The resulting command line is "coreneuron <arg> --skip-mpi-finalize",
 * followed by "--mpi" when use_mpi is non-zero and by "--mpi-lib <mpi_lib>"
 * when a non-empty library name is given.
 *
 * @param[out] argc number of tokens stored in argv
 * @param[out] argv array allocated with new[]; holds argc token pointers
 *                  followed by a terminating nullptr (argv[argc] == nullptr).
 *                  The caller releases it with delete[].
 * @param use_mpi non-zero to append "--mpi"
 * @param mpi_lib name of the MPI library; nullptr or "" means "not provided"
 * @param arg space separated arguments coming from neuron; nullptr is treated
 *            as an empty string
 * @return the strdup'ed buffer backing every argv entry. It must be released
 *         with free(), and only once argv is no longer used.
 */
char* prepare_args(int& argc, char**& argv, int use_mpi, const char* mpi_lib, const char* arg) {
    // first construct all arguments as string; constructing std::string from
    // a null pointer is undefined behaviour, hence the explicit fallback
    std::string args(arg ? arg : "");
    args.insert(0, " coreneuron ");
    args.append(" --skip-mpi-finalize ");
    if (use_mpi) {
        args.append(" --mpi ");
    }

    // if neuron has passed name of MPI library then add it to CLI
    if (mpi_lib && *mpi_lib) {
        args.append(" --mpi-lib ");
        args.append(mpi_lib);
        args.append(" ");
    }

    // strtok writes separators into its input, so tokenize a private copy.
    // The tokens point into that copy, which is why it is handed back to the caller.
    const char* sep = " ";
    char* const storage = strdup(args.c_str());
    std::vector<char*> tokens;
    for (char* token = storage ? strtok(storage, sep) : nullptr; token;
         token = strtok(nullptr, sep)) {
        tokens.push_back(token);
    }

    // now build char*argv with one extra slot for the conventional null terminator
    argc = static_cast<int>(tokens.size());
    argv = new char*[tokens.size() + 1];
    for (std::size_t i = 0; i < tokens.size(); ++i) {
        argv[i] = tokens[i];
    }
    argv[tokens.size()] = nullptr;

    // return actual data to be freed
    return storage;
}
}

namespace coreneuron {
void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init);

// bsize = 0 then per step transfer
// bsize > 1 then full trajectory save into arrays.
void get_nrn_trajectory_requests(int bsize) {
    if (nrn2core_get_trajectory_requests_) {
        for (int tid = 0; tid < nrn_nthread; ++tid) {
            NrnThread& nt = nrn_threads[tid];
            int n_pr;
            int n_trajec;
            int* types;
            int* indices;
            void** vpr;
            double** varrays;
            double** pvars;

            // bsize is passed by reference, the return value will determine if
            // per step return or entire trajectory return.
            (*nrn2core_get_trajectory_requests_)(
                tid, bsize, n_pr, vpr, n_trajec, types, indices, pvars, varrays);
            delete_trajectory_requests(nt);
            if (n_trajec) {
                TrajectoryRequests* tr = new TrajectoryRequests;
                nt.trajec_requests = tr;
                tr->bsize = bsize;
                tr->n_pr = n_pr;
                tr->n_trajec = n_trajec;
                tr->vsize = 0;
                tr->vpr = vpr;
                tr->gather = new double*[n_trajec];
                tr->varrays = varrays;
                tr->scatter = pvars;
                for (int i = 0; i < n_trajec; ++i) {
                    tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
                }
                delete[] types;
                delete[] indices;
            }
        }
    }
}

/**
 * Initialise global simulator state from corenrn_param and load the model.
 *
 * In order: resolves dt and celsius (command line over globals.dat over the built-in
 * defaults 0.025 and 34.0), configures cell permutation and multisend options, calls
 * nrn_setup() to read the dataset, sets up spike exchange and buffers, performs the
 * embedded (direct mode) initialisation when corenrn_embedded is set, copies data to the
 * device when --gpu is requested and finally dumps the initial state of prcellgid.
 *
 * @param argc not referenced in this function body
 * @param argv not referenced in this function body
 * @param checkPoints provides the restore time and restore path; forwarded to nrn_setup()
 * @param is_mapping_needed forwarded to nrn_setup()
 * @param run_setup_cleanup forwarded to nrn_setup()
 *
 * Calls exit(1) for unsupported option combinations in GPU-enabled builds.
 */
void nrn_init_and_load_data(int argc,
                            char* argv[],
                            CheckPoints& checkPoints,
                            bool is_mapping_needed,
                            bool run_setup_cleanup) {
#if defined(NRN_FEEXCEPT)
    nrn_feenableexcept();
#endif

    /// profiler like tau/vtune : do not measure from beginning
    Instrumentor::stop_profile();

    // memory footprint after mpi initialisation
    if (!corenrn_param.is_quiet()) {
        report_mem_usage("After MPI_Init");
    }

    // initialise default coreneuron parameters
    initnrn();

    // set global variables
    // precedence is: set by user, globals.dat, 34.0
    // (the command line value is stored first so that the -1000. sentinel is
    // still visible after set_globals() when neither source provides a value)
    celsius = corenrn_param.celsius;

#if CORENEURON_ENABLE_GPU
    if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) {
        fprintf(stderr,
                "compiled with CORENEURON_ENABLE_GPU does not allow the combination of "
                "--cell-permute=2 and "
                "missing --gpu\n");
        exit(1);
    }
    if (!corenrn_param.gpu && corenrn_param.cuda_interface) {
        fprintf(stderr,
                "compiled with OpenACC/CUDA does not allow the combination of --cuda-interface and "
                "missing --gpu\n");
        exit(1);
    }
#endif

// if multi-threading enabled, make sure mpi library supports it
#if NRNMPI
    if (corenrn_param.mpi_enable && corenrn_param.threading) {
        nrnmpi_check_threading_support();
    }
#endif

    // full path of files.dat file
    std::string filesdat(corenrn_param.datpath + "/" + corenrn_param.filesdat);

    // read the global variable names and set their values from globals.dat
    set_globals(corenrn_param.datpath.c_str(), (corenrn_param.seed >= 0), corenrn_param.seed);

    // set global variables for start time, timestep and temperature
    if (!corenrn_embedded) {
        t = checkPoints.restore_time();
    }

    // -1000. is the "not set" sentinel for dt
    if (corenrn_param.dt != -1000.) {  // command line arg highest precedence
        dt = corenrn_param.dt;
    } else if (dt == -1000.) {  // not on command line and no dt in globals.dat
        dt = 0.025;             // lowest precedence
    }

    // publish the resolved value so that later readers of corenrn_param see it
    corenrn_param.dt = dt;

    rev_dt = (int) (1. / dt);

    // -1000. is the "not set" sentinel for celsius
    if (corenrn_param.celsius != -1000.) {  // command line arg highest precedence
        celsius = corenrn_param.celsius;
    } else if (celsius == -1000.) {  // not on command line and no celsius in globals.dat
        celsius = 34.0;              // lowest precedence
    }

    corenrn_param.celsius = celsius;

    // create net_cvode instance
    mk_netcvode();

    // One part done before call to nrn_setup. Other part after.

    if (!corenrn_param.patternstim.empty()) {
        nrn_set_extra_thread0_vdata();
    }

    if (!corenrn_param.is_quiet()) {
        report_mem_usage("Before nrn_setup");
    }

    // set if need to interleave cells
    interleave_permute_type = corenrn_param.cell_interleave_permute;
    cellorder_nwarp = corenrn_param.nwarp;
    use_solve_interleave = corenrn_param.cell_interleave_permute;

    // GPU runs cannot use permutation type 0: fall back to type 1 and tell the user (rank 0 only)
    if (corenrn_param.gpu && interleave_permute_type == 0) {
        if (nrnmpi_myid == 0) {
            printf(
                " WARNING : GPU execution requires --cell-permute type 1 or 2. Setting it to 1.\n");
        }
        interleave_permute_type = 1;
        use_solve_interleave = true;
    }

    // multisend options
    use_multisend_ = corenrn_param.multisend ? 1 : 0;
    n_multisend_interval = corenrn_param.ms_subint;
    use_phase2_ = (corenrn_param.ms_phases == 2) ? 1 : 0;

    // reading *.dat files and setting up the data structures, setting mindelay
    nrn_setup(filesdat.c_str(),
              is_mapping_needed,
              checkPoints,
              run_setup_cleanup,
              corenrn_param.datpath.c_str(),
              checkPoints.get_restore_path().c_str(),
              &corenrn_param.mindelay);

    // Allgather spike compression and  bin queuing.
    nrn_use_bin_queue_ = corenrn_param.binqueue;
    int spkcompress = corenrn_param.spkcompress;
    nrnmpi_spike_compress(spkcompress, (spkcompress ? true : false), use_multisend_);

    if (!corenrn_param.is_quiet()) {
        report_mem_usage("After nrn_setup ");
    }

    // Invoke PatternStim
    if (!corenrn_param.patternstim.empty()) {
        nrn_mkPatternStim(corenrn_param.patternstim.c_str(), corenrn_param.tstop);
    }

    /// Setting the timeout
    nrn_set_timeout(200.);

    // show all configuration parameters for current run
    if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
        std::cout << corenrn_param << std::endl;
        std::cout << " Start time (t) = " << t << std::endl << std::endl;
    }

    // allocate buffer for mpi communication
    mk_spikevec_buffer(corenrn_param.spikebuf);

    if (!corenrn_param.is_quiet()) {
        report_mem_usage("After mk_spikevec_buffer");
    }

    // In direct mode there are likely trajectory record requests
    // to allow processing in NEURON after simulation by CoreNEURON
    if (corenrn_embedded) {
        // arg is additional vector size required (how many items will be
        // written to the double*) but NEURON can instead
        // specify that returns will be on a per time step basis.
        get_nrn_trajectory_requests(int((corenrn_param.tstop - t) / corenrn_param.dt) + 2);

        // In direct mode, CoreNEURON has exactly the behavior of
        // ParallelContext.psolve(tstop). Ie a sequence of such calls
        // without an intervening h.finitialize() continues from the end
        // of the previous call. I.e., all initial state, including
        // the event queue has been set up in NEURON. And, at the end
        // all final state, including the event queue will be sent back
        // to NEURON. Here there is some first time only
        // initialization and queue transfer.
        direct_mode_initialize();
        clear_spike_vectors();  // PreSyn send already recorded by NEURON
        // callback is dereferenced without a null check
        (*nrn2core_part2_clean_)();
    }

    if (corenrn_param.gpu) {
        // Copy nrnthreads to device only after all the data are passed from NEURON and the
        // nrnthreads on CPU are properly set up
        setup_nrnthreads_on_device(nrn_threads, nrn_nthread);
    }

    if (corenrn_embedded) {
        // Run nrn_init of mechanisms only to allocate any extra data needed on the GPU after
        // nrnthreads are properly set up on the GPU
        allocate_data_in_mechanism_nrn_init();
    }

    if (corenrn_param.gpu) {
        if (nrn_have_gaps) {
            nrn_partrans::copy_gap_indices_to_device();
        }
    }

    // call prcellstate for prcellgid
    call_prcellstate_for_prcellgid(corenrn_param.prcellgid, corenrn_param.gpu, 1);
}

/**
 * Dump the state of cell prcellgid via prcellstate(), if a gid was requested.
 *
 * The dump name is "<prefix>_gpu_init" / "<prefix>_gpu_t<time>" for GPU runs
 * (prefix "cu" with ENABLE_CUDA, "acc" otherwise) and "cpu_init" /
 * "cpu_t<time>" for CPU runs, where <time> is the global t printed with "%f".
 * Host copies of the threads are refreshed before dumping.
 *
 * @param prcellgid gid of the cell to dump; negative means "none", nothing is done
 * @param compute_gpu non-zero when the simulation runs on GPU
 * @param is_init non-zero to use the "init" suffix instead of the current time
 */
void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init) {
    if (prcellgid < 0) {
        return;
    }

#ifdef ENABLE_CUDA
    const char* prprefix = "cu";
#else
    const char* prprefix = "acc";
#endif

    char prcellname[1024];
    if (compute_gpu && is_init) {
        sprintf(prcellname, "%s_gpu_init", prprefix);
    } else if (compute_gpu) {
        sprintf(prcellname, "%s_gpu_t%f", prprefix, t);
    } else if (is_init) {
        strcpy(prcellname, "cpu_init");
    } else {
        sprintf(prcellname, "cpu_t%f", t);
    }

    update_nrnthreads_on_host(nrn_threads, nrn_nthread);
    prcellstate(prcellgid, prcellname);
}

/* Perform the forward skip: ten minimal fixed steps of size forwardskip/10
 * starting at t = -1e9, optionally dump prcellgid as "fs", then restore the
 * regular dt and t and discard the spikes produced meanwhile. */
void handle_forward_skip(double forwardskip, int prcellgid) {
    // keep the regular time step and time so they can be put back afterwards
    const double regular_dt = dt;
    const double regular_t = t;

    t = -1e9;
    dt = forwardskip * 0.1;
    dt2thread(-1.);

    int steps_left = 10;
    while (steps_left > 0) {
        nrn_fixed_step_minimal();
        --steps_left;
    }

    // a negative gid means no cell state dump was requested
    if (!(prcellgid < 0)) {
        prcellstate(prcellgid, "fs");
    }

    t = regular_t;
    dt = regular_dt;
    dt2thread(-1.);

    // clear spikes generated during forward skip (with negative time)
    clear_spike_vectors();
}

/// Return the string produced by version::to_string().
std::string cnrn_version() {
    return version::to_string();
}


static void trajectory_return() {
    if (nrn2core_trajectory_return_) {
        for (int tid = 0; tid < nrn_nthread; ++tid) {
            NrnThread& nt = nrn_threads[tid];
            TrajectoryRequests* tr = nt.trajec_requests;
            if (tr && tr->varrays) {
                (*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->bsize, tr->vsize, tr->vpr, nt._t);
            }
        }
    }
}

/**
 * Create the report handler matching the format of a report configuration.
 *
 * @param config report configuration; its format selects the handler type
 * @param spikes_info passed to the SONATA handler constructor
 * @return a BinaryReportHandler for "Bin", a SonataReportHandler for "SONATA",
 *         nullptr for any other format (rank 0 prints a warning in that case)
 */
std::unique_ptr<ReportHandler> create_report_handler(const ReportConfiguration& config,
                                                     const SpikesInfo& spikes_info) {
    if (config.format == "Bin") {
        return std::make_unique<BinaryReportHandler>();
    }
    if (config.format == "SONATA") {
        return std::make_unique<SonataReportHandler>(spikes_info);
    }

    // unknown format: warn once (on rank 0) and let the caller skip this report
    if (nrnmpi_myid == 0) {
        printf(" WARNING : Report name '%s' has unknown format: '%s'.\n",
               config.name.data(),
               config.format.data());
    }
    return nullptr;
}

}  // namespace coreneuron

/// The following high-level functions are marked as "extern C"
/// for compat with C, namely Neuron mod files.
/// They split the previous solve_core so that intermediate init of external mechanisms can occur.
/// See mech/corenrnmech.cpp for the new all-in-one solve_core (not compiled into the coreneuron
/// lib since with nrnivmodl-core we have 'future' external mechanisms)

using namespace coreneuron;

#if NRNMPI && defined(CORENEURON_ENABLE_MPI_DYNAMIC)
/**
 * Open the given shared library with dlopen(RTLD_NOW | RTLD_GLOBAL).
 *
 * @param libname name or path of the library, passed to dlopen as is
 * @return the handle returned by dlopen
 * @throws std::runtime_error with the dlerror() text if dlerror() reports a
 *         failure after the dlopen call
 */
static void* load_dynamic_mpi(const std::string& libname) {
    // reset the pending error state so the check below only reflects this dlopen call
    dlerror();
    void* const library = dlopen(libname.c_str(), RTLD_NOW | RTLD_GLOBAL);
    if (const char* const failure = dlerror()) {
        std::string message{"Could not open dynamic MPI library: "};
        message += failure;
        message += "\n";
        throw std::runtime_error(message);
    }
    return library;
}
#endif

/**
 * First half of the former solve_core: parse parameters and initialise mechanisms.
 *
 * Resets and parses corenrn_param from the command line, initialises MPI when enabled
 * (loading the MPI library dynamically first if built with CORENEURON_ENABLE_MPI_DYNAMIC),
 * initialises the GPU when requested, optionally writes the parsed parameters to a file and
 * finally calls mk_mech() with the data path.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments; also passed by address to nrnmpi_init()
 * @throws std::runtime_error in dynamic MPI builds when MPI is enabled but no --mpi-lib was
 *         given, or when the library cannot be loaded
 */
extern "C" void mk_mech_init(int argc, char** argv) {
    // reset all parameters to their default values
    corenrn_param.reset();

    // read command line parameters and parameter config files
    corenrn_param.parse(argc, argv);

#if NRNMPI
    if (corenrn_param.mpi_enable) {
#ifdef CORENEURON_ENABLE_MPI_DYNAMIC
        // coreneuron relies on neuron to detect the mpi library distribution and
        // the name of the library itself. Make sure the library name is specified
        // via CLI option.
        if (corenrn_param.mpi_lib.empty()) {
            throw std::runtime_error(
                "For dynamic MPI support you must pass '--mpi-lib "
                "/path/libcorenrnmpi_<name>.<suffix>` argument!\n");
        }

        // neuron can call coreneuron multiple times and hence we do not
        // want to initialize/load mpi library multiple times
        static bool mpi_lib_loaded = false;
        if (!mpi_lib_loaded) {
            auto mpi_handle = load_dynamic_mpi(corenrn_param.mpi_lib);
            mpi_manager().resolve_symbols(mpi_handle);
            mpi_lib_loaded = true;
        }
#endif
        // record the number of ranks and this rank's id in the globals
        auto ret = nrnmpi_init(&argc, &argv, corenrn_param.is_quiet());
        nrnmpi_numprocs = ret.numprocs;
        nrnmpi_myid = ret.myid;
    }
#endif

#ifdef CORENEURON_ENABLE_GPU
    if (corenrn_param.gpu) {
        init_gpu();
        // these three are removed again with cnrn_target_delete() at the end of run_solve_core()
        cnrn_target_copyin(&celsius);
        cnrn_target_copyin(&pi);
        cnrn_target_copyin(&secondorder);
        nrnran123_initialise_global_state_on_device();
    }
#endif

    // optionally dump the parsed configuration, truncating any existing file
    if (!corenrn_param.writeParametersFilepath.empty()) {
        std::ofstream out(corenrn_param.writeParametersFilepath, std::ios::trunc);
        out << corenrn_param.config_to_str(false, false);
        out.close();
    }

    // reads mechanism information from bbcore_mech.dat
    mk_mech((corenrn_param.datpath).c_str());
}

/**
 * Second half of the former solve_core: load the model, run the simulation and clean up.
 *
 * Sequence: create report configurations, load the data (nrn_init_and_load_data), initialise
 * or restore the state, register reports, optional forward skip, solve up to tstop, return
 * trajectories/weights/data to NEURON, write spikes and checkpoint, finalize reports, release
 * GPU and host data and, unless --skip-mpi-finalize was given, finalize MPI.
 *
 * @param argc forwarded to nrn_init_and_load_data()
 * @param argv forwarded to nrn_init_and_load_data()
 * @return always 0
 *
 * Rank 0 calls abort() when tstop is smaller than the start time t.
 */
extern "C" int run_solve_core(int argc, char** argv) {
    Instrumentor::phase_begin("main");

    std::vector<ReportConfiguration> configs;
    std::vector<std::unique_ptr<ReportHandler>> report_handlers;
    SpikesInfo spikes_info;
    // remembered separately because configs is cleared once the reports are registered
    bool reports_needs_finalize = false;

    if (!corenrn_param.is_quiet()) {
        report_mem_usage("After mk_mech");
    }

    // Create outpath if it does not exist
    if (nrnmpi_myid == 0) {
        mkdir_p(corenrn_param.outpath.c_str());
    }

    if (!corenrn_param.reportfilepath.empty()) {
        configs = create_report_configurations(corenrn_param.reportfilepath,
                                               corenrn_param.outpath,
                                               spikes_info);
        reports_needs_finalize = !configs.empty();
    }

    CheckPoints checkPoints{corenrn_param.checkpointpath, corenrn_param.restorepath};

    // initialization and loading functions moved to a separate routine
    {
        Instrumentor::phase p("load-model");
        nrn_init_and_load_data(argc, argv, checkPoints, !configs.empty());
    }

    std::string output_dir = corenrn_param.outpath;

    if (nrnmpi_myid == 0) {
        mkdir_p(output_dir.c_str());
    }
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        nrnmpi_barrier();
    }
#endif
    bool compute_gpu = corenrn_param.gpu;

    nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
    nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
    {
        // local copies of the run parameters; note that this dt shadows the global dt
        double v = corenrn_param.voltage;
        double dt = corenrn_param.dt;
        double delay = corenrn_param.mindelay;
        double tstop = corenrn_param.tstop;

        if (tstop < t && nrnmpi_myid == 0) {
            printf("Error: Stop time (%lf) < Start time (%lf), restoring from checkpoint? \n",
                   tstop,
                   t);
            abort();
        }

        // TODO : if some ranks are empty then restore will go in deadlock
        // phase (as some ranks won't have restored anything and hence return
        // false in checkpoint_initialize
        if (!corenrn_embedded && !checkPoints.initialize()) {
            // v == 1000. means that no initial voltage is imposed
            nrn_finitialize(v != 1000., v);
        }

        if (!corenrn_param.is_quiet()) {
            report_mem_usage("After nrn_finitialize");
        }

        // register all reports into reportinglib
        // and track the smallest report_dt over all configurations
        double min_report_dt = INT_MAX;
        for (size_t i = 0; i < configs.size(); i++) {
            std::unique_ptr<ReportHandler> report_handler = create_report_handler(configs[i],
                                                                                  spikes_info);
            // a null handler means an unknown report format: that report is skipped
            if (report_handler) {
                report_handler->create_report(configs[i], dt, tstop, delay);
                report_handlers.push_back(std::move(report_handler));
            }
            if (configs[i].report_dt < min_report_dt) {
                min_report_dt = configs[i].report_dt;
            }
        }
        // Set the buffer size if is not the default value. Otherwise use report.conf on
        // register_report
        if (corenrn_param.report_buff_size != corenrn_param.report_buff_size_default) {
            set_report_buffer_size(corenrn_param.report_buff_size);
        }

        if (!configs.empty()) {
            setup_report_engine(min_report_dt, delay);
            configs.clear();
        }

        // call prcellstate for prcellgid
        call_prcellstate_for_prcellgid(corenrn_param.prcellgid, compute_gpu, 0);

        // handle forwardskip
        if (corenrn_param.forwardskip > 0.0) {
            Instrumentor::phase p("handle-forward-skip");
            handle_forward_skip(corenrn_param.forwardskip, corenrn_param.prcellgid);
        }

        /// Solver execution
        Instrumentor::start_profile();
        Instrumentor::phase_begin("simulation");
        BBS_netpar_solve(corenrn_param.tstop);
        Instrumentor::phase_end("simulation");
        Instrumentor::stop_profile();

        // update cpu copy of NrnThread from GPU
        update_nrnthreads_on_host(nrn_threads, nrn_nthread);

        // direct mode and full trajectory gathering on CoreNEURON, send back.
        if (corenrn_embedded) {
            trajectory_return();
        }

        // Report global cell statistics
        if (!corenrn_param.is_quiet()) {
            report_cell_stats();
        }

        // prcellstate after end of solver
        call_prcellstate_for_prcellgid(corenrn_param.prcellgid, compute_gpu, 0);
    }

    // write spike information to outpath
    {
        Instrumentor::phase p("output-spike");
        output_spikes(output_dir.c_str(), spikes_info);
    }

    // copy weights back to NEURON NetCon
    if (nrn2core_all_weights_return_) {
        // first update weights from gpu
        update_weights_from_gpu(nrn_threads, nrn_nthread);

        // store weight pointers
        std::vector<double*> weights(nrn_nthread, nullptr);

        // could be one thread more (empty) than in NEURON but does not matter
        for (int i = 0; i < nrn_nthread; ++i) {
            weights[i] = nrn_threads[i].weights;
        }
        (*nrn2core_all_weights_return_)(weights);
    }

    core2nrn_data_return();

    {
        Instrumentor::phase p("checkpoint");
        checkPoints.write_checkpoint(nrn_threads, nrn_nthread);
    }

    // must be done after checkpoint (to avoid deleting events)
    if (reports_needs_finalize) {
        finalize_report();
    }

    // cleanup threads on GPU
    if (corenrn_param.gpu) {
        delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
        if (nrn_have_gaps) {
            nrn_partrans::delete_gap_indices_from_device();
        }
        nrnran123_destroy_global_state_on_device();
        // counterparts of the cnrn_target_copyin() calls in mk_mech_init()
        cnrn_target_delete(&secondorder);
        cnrn_target_delete(&pi);
        cnrn_target_delete(&celsius);
    }

    // Cleaning the memory
    nrn_cleanup();

    // tau needs to resume profile
    Instrumentor::start_profile();

// mpi finalize
#if NRNMPI
    if (corenrn_param.mpi_enable && !corenrn_param.skip_mpi_finalize) {
        nrnmpi_finalize();
    }
#endif

    Instrumentor::phase_end("main");

    return 0;
}


================================================
FILE: coreneuron/config/config.cpp.in
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/config/config.h"

// Definitions of the static members declared in coreneuron/config/config.h.
// NOTE(review): the @...@ tokens are presumably replaced when this .in template is
// configured by the build system - confirm against the CMake files.

/// Git version of the project
const std::string coreneuron::version::GIT_REVISION = "@CN_GIT_REVISION@";

/// CoreNEURON version
const std::string coreneuron::version::CORENEURON_VERSION = "@CN_PROJECT_VERSION@";


================================================
FILE: coreneuron/config/config.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

/**
 * \dir
 * \brief Global project configurations
 *
 * \file
 * \brief Version information
 */

#include <string>

namespace coreneuron {

/**
 * \brief Project version information
 *
 * Both strings are only declared here; their definitions live in a separate
 * translation unit.
 */
struct version {
    /// project tagged version in the cmake
    static const std::string CORENEURON_VERSION;

    /// git revision id
    static const std::string GIT_REVISION;

    /// build the "<version> <git id>" string, the two parts separated by one space
    static std::string to_string() {
        std::string combined{CORENEURON_VERSION};
        combined += " ";
        combined += GIT_REVISION;
        return combined;
    }
};

}  // namespace coreneuron


================================================
FILE: coreneuron/config/neuron_version.hpp.in
================================================
/*
# =============================================================================
# Copyright (c) 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once

// This is the CoreNEURON analogue of nrnsemanticversion.h in NEURON. Hopefully
// the duplication can go away soon.
#define NRN_VERSION_MAJOR @NRN_VERSION_MAJOR@
#define NRN_VERSION_MINOR @NRN_VERSION_MINOR@
#define NRN_VERSION_PATCH @NRN_VERSION_PATCH@


================================================
FILE: coreneuron/config/version_macros.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once

// This is the CoreNEURON analogue of nrnversionmacros.h in NEURON. Hopefully
// the duplication can go away soon.
#include "coreneuron/config/neuron_version.hpp"
// Encode a (major, minor, patch) triple as the single integer
// 10000 * major + 100 * minor + patch so that versions can be compared with
// ordinary integer operators, including inside #if directives.
// Every argument is parenthesised so that expression arguments such as
// NRN_VERSION_INT(X + 1, 0, 0) expand with the intended precedence.
#define NRN_VERSION_INT(maj, min, pat)  (10000 * (maj) + 100 * (min) + (pat))
// Integer encoding of the NEURON version this build was configured against.
#define NRN_VERSION                     NRN_VERSION_INT(NRN_VERSION_MAJOR, NRN_VERSION_MINOR, NRN_VERSION_PATCH)
// Comparisons of NRN_VERSION against a given (major, minor, patch) triple.
#define NRN_VERSION_EQ(maj, min, pat)   (NRN_VERSION == NRN_VERSION_INT(maj, min, pat))
#define NRN_VERSION_NE(maj, min, pat)   (NRN_VERSION != NRN_VERSION_INT(maj, min, pat))
#define NRN_VERSION_GT(maj, min, pat)   (NRN_VERSION > NRN_VERSION_INT(maj, min, pat))
#define NRN_VERSION_LT(maj, min, pat)   (NRN_VERSION < NRN_VERSION_INT(maj, min, pat))
#define NRN_VERSION_GTEQ(maj, min, pat) (NRN_VERSION >= NRN_VERSION_INT(maj, min, pat))
#define NRN_VERSION_LTEQ(maj, min, pat) (NRN_VERSION <= NRN_VERSION_INT(maj, min, pat))

// 8.2.0 is significant because all versions >=8.2.0 should contain definitions
// of these macros, and doing #ifndef NRN_VERSION_GTEQ_8_2_0 is a more
// descriptive way of writing #if defined(NRN_VERSION_GTEQ). Testing for 8.2.0
// is likely to be a common pattern when adapting MOD file VERBATIM blocks for
// C++ compatibility.
#if NRN_VERSION_GTEQ(8, 2, 0)
#define NRN_VERSION_GTEQ_8_2_0
#endif


================================================
FILE: coreneuron/coreneuron.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once

/***
 * Includes all headers required to communicate and run all methods
 * described in CoreNEURON, neurox, and mod2c C-generated mechanisms
 * functions.
 **/


#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <string.h>
#include <vector>
#include <array>

#include "coreneuron/utils/randoms/nrnran123.h"     //Random Number Generator
#include "coreneuron/sim/scopmath/newton_struct.h"  //Newton Struct
#include "coreneuron/membrane_definitions.h"        //static definitions
#include "coreneuron/mechanism/mechanism.hpp"       //Memb_list and mechs info

#include "coreneuron/utils/memory.h"  //Memory alignments and padding
#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mechanism/mech_mapping.hpp"

namespace coreneuron {

// from nrnoc/capac.c
extern void nrn_init_capacitance(NrnThread*, Memb_list*, int);
extern void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int type);
extern void nrn_alloc_capacitance(double* data, Datum* pdata, int type);

// from nrnoc/eion.c
extern void nrn_init_ion(NrnThread*, Memb_list*, int);
extern void nrn_cur_ion(NrnThread* _nt, Memb_list* ml, int type);
extern void nrn_alloc_ion(double* data, Datum* pdata, int type);
extern void second_order_cur(NrnThread* _nt, int secondorder);

using DependencyTable = std::vector<std::vector<int>>;

/**
 * A class representing the CoreNEURON state, holding pointers to the various data structures
 *
 * The pointers to "global" data such as the NrnThread, Memb_list and Memb_func data structures
 * are managed here. They logically share their lifetime and runtime scope with instances of
 * this class.
 *
 * All data members are private; each is exposed through a public accessor returning a
 * mutable reference, so callers read and modify the tables in place.
 */
class CoreNeuron {
    /**
     * map if mech is a point process
     * In the future only a field of Mechanism class
     */
    std::vector<char> pnt_map; /* so prop_free can know its a point mech*/

    /** Vector mapping the types (IDs) of different mechanisms of mod files between NEURON and
     * CoreNEURON
     */
    std::vector<int> different_mechanism_type;

    /**
     * dependency helper filled by calls to hoc_register_dparam_semantics
     * used when nrn_mech_depend is called
     * vector-of-vector DS. First idx is the mech, second idx is the dependent mech.
     */
    DependencyTable ion_write_dependency;

    std::vector<Memb_func> memb_funcs;

    /**
     * Net send / Net receive
     * only used in CoreNEURON for book keeping synapse mechs, should go into CoreNEURON class
     */
    std::vector<std::pair<NetBufReceive_t, int>> net_buf_receive;
    std::vector<int> net_buf_send_type;

    /**
     * before-after-blocks from nmodl are registered here as function pointers
     */
    std::array<BAMech*, BEFORE_AFTER_SIZE> bamech;

    /**
     * Internal lookup tables. Number of float and int variables in each mechanism and memory layout
     * future --> mech class
     */
    std::vector<int> nrn_prop_param_size;
    std::vector<int> nrn_prop_dparam_size;
    std::vector<int> nrn_mech_data_layout; /* 1 AoS (default), 0 SoA */
    /* array is parallel to memb_func. All are 0 except 1 for ARTIFICIAL_CELL */
    std::vector<short> nrn_artcell_qindex;
    std::vector<bool> nrn_is_artificial;

    /**
     * Net Receive function pointer lookup tables
     */
    std::vector<pnt_receive_t> pnt_receive; /* for synaptic events. */
    std::vector<pnt_receive_t> pnt_receive_init;
    std::vector<short> pnt_receive_size;

    /**
     * Holds function pointers for WATCH callback
     */
    std::vector<nrn_watch_check_t> nrn_watch_check;

    /**
     * values are type numbers of mechanisms which do net_send call
     * related to NMODL net_event()
     *
     */
    std::vector<int> nrn_has_net_event;

    /**
     * inverse of nrn_has_net_event: maps the values of nrn_has_net_event to the index of
     * pnttype2presyn
     */
    std::vector<int> pnttype2presyn;


    /* per-mechanism bbcore_read / bbcore_write function pointers */
    std::vector<bbcore_read_t> nrn_bbcore_read;
    std::vector<bbcore_write_t> nrn_bbcore_write;

  public:
    // Accessors: each returns a mutable reference to the corresponding private member.

    auto& get_memb_funcs() {
        return memb_funcs;
    }

    // No bounds check is performed on idx (plain operator[]).
    auto& get_memb_func(size_t idx) {
        return memb_funcs[idx];
    }

    auto& get_different_mechanism_type() {
        return different_mechanism_type;
    }

    auto& get_pnt_map() {
        return pnt_map;
    }

    auto& get_ion_write_dependency() {
        return ion_write_dependency;
    }

    auto& get_net_buf_receive() {
        return net_buf_receive;
    }

    auto& get_net_buf_send_type() {
        return net_buf_send_type;
    }

    auto& get_bamech() {
        return bamech;
    }

    auto& get_prop_param_size() {
        return nrn_prop_param_size;
    }

    auto& get_prop_dparam_size() {
        return nrn_prop_dparam_size;
    }

    auto& get_mech_data_layout() {
        return nrn_mech_data_layout;
    }

    auto& get_is_artificial() {
        return nrn_is_artificial;
    }

    auto& get_artcell_qindex() {
        return nrn_artcell_qindex;
    }

    auto& get_pnt_receive() {
        return pnt_receive;
    }

    auto& get_pnt_receive_init() {
        return pnt_receive_init;
    }

    auto& get_pnt_receive_size() {
        return pnt_receive_size;
    }

    auto& get_watch_check() {
        return nrn_watch_check;
    }

    auto& get_has_net_event() {
        return nrn_has_net_event;
    }

    auto& get_pnttype2presyn() {
        return pnttype2presyn;
    }

    auto& get_bbcore_read() {
        return nrn_bbcore_read;
    }

    auto& get_bbcore_write() {
        return nrn_bbcore_write;
    }
};

extern CoreNeuron corenrn;

}  // namespace coreneuron


================================================
FILE: coreneuron/engine.h.in
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

// Use MAJOR.MINOR for public version
#define CORENEURON_VERSION @CORENEURON_VERSION_COMBINED@

#ifdef __cplusplus
extern "C" {
#endif

/// All-in-one initialization of mechanisms and solver
extern int solve_core(int argc, char** argv);

/// Initialize mechanisms
extern void mk_mech_init(int argc, char** argv);
/// Run core solver
extern int run_solve_core(int argc, char** argv);

#ifdef __cplusplus
}
#endif


================================================
FILE: coreneuron/gpu/nrn_acc_manager.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <queue>
#include <utility>

#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/network/netcon.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/utils/vrecitem.h"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/permute/data_layout.hpp"
#include "coreneuron/sim/scopmath/newton_struct.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/mpi/nrnmpidec.h"
#include "coreneuron/utils/utils.hpp"

#ifdef CRAYPAT
#include <pat_api.h>
#endif

#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#include <cuda_runtime_api.h>
#endif

#if __has_include(<cxxabi.h>)
#define USE_CXXABI
#include <cxxabi.h>
#include <memory>
#include <string>
#endif

#ifdef CORENEURON_ENABLE_PRESENT_TABLE
#include <cassert>
#include <cstddef>
#include <iostream>
#include <map>
#include <shared_mutex>
namespace {
/** @brief Bookkeeping record for one host block tracked in the present table.
 *
 *  All members are value-initialised (zero / nullptr) by default.
 */
struct present_table_value {
    /// Number of outstanding copyin registrations for this block.
    std::size_t ref_count{};
    /// Length recorded for the block when it was registered.
    std::size_t size{};
    /// Device address associated with the start of the block.
    std::byte* dev_ptr{};
};

/// Present table: maps the host start address of a block to its record.
std::map<std::byte const*, present_table_value> present_table;

/// Guards `present_table`: shared for lookups, exclusive for updates.
std::shared_mutex present_table_mutex;
}  // namespace
#endif

namespace {
/** @brief Produce a human-readable version of a mangled C++ type name.
 *
 *  When <cxxabi.h> support was detected (USE_CXXABI) the name is passed to
 *  abi::__cxa_demangle; if that reports a non-zero status, or if the ABI
 *  header is not available, the input string is returned unchanged.
 */
std::string cxx_demangle(const char* mangled) {
#ifdef USE_CXXABI
    int status{};
    // The buffer returned by abi::__cxa_demangle must be released with free();
    // the unique_ptr takes care of that. The (unused here) third argument
    // would return the length of the allocated buffer, which may be larger
    // than strlen(result) + 1.
    std::unique_ptr<char, decltype(free)*> demangled{
        abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
    if (status != 0) {
        return mangled;
    }
    return demangled.get();
#else
    return mangled;
#endif
}

/** @brief Read the CORENEURON_GPU_DEBUG environment variable.
 *
 *  @return false if the variable is unset or "0", true if it is "1".
 *  @throws std::runtime_error for any other value.
 */
bool cnrn_target_debug_output_enabled() {
    const char* const raw_value = std::getenv("CORENEURON_GPU_DEBUG");
    if (raw_value == nullptr) {
        return false;
    }
    std::string const value{raw_value};
    if (value == "0") {
        return false;
    }
    if (value == "1") {
        return true;
    }
    throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + value + ")");
}

/// Cached result of cnrn_target_debug_output_enabled(), evaluated when this
/// variable is initialised.
bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
}  // namespace

namespace coreneuron {
// Declarations of entities used in this file whose definitions are not
// visible here.

/// Array of per-thread interleave information (indexed by thread id in
/// setup_nrnthreads_on_device).
extern InterleaveInfo* interleave_info;
/// Ion global map device transfer helpers.
void nrn_ion_global_map_copyto_device();
void nrn_ion_global_map_delete_from_device();
/// VecPlay device transfer helpers for a given thread.
void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
void nrn_VecPlay_delete_from_device(NrnThread* nt);

/** @brief Log a cnrn_target_copyin call to stderr when GPU debug output is on.
 *
 *  Prints the call site (file:line), the demangled element type, the host
 *  pointer, the element count and size, the total byte count and the
 *  resulting device pointer. Does nothing if debug output is disabled.
 */
void cnrn_target_copyin_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len,
                              void* d_ptr) {
    if (cnrn_target_enable_debug) {
        auto const type_name = cxx_demangle(typeid_T.name());
        auto const n_bytes = len * sizeof_T;
        std::cerr << file << ':' << line << ": cnrn_target_copyin<" << type_name << ">(" << h_ptr
                  << ", " << len << " * " << sizeof_T << " = " << n_bytes << ") -> " << d_ptr
                  << std::endl;
    }
}
/** @brief Log a cnrn_target_delete call to stderr when GPU debug output is on.
 *
 *  Prints the call site (file:line), the demangled element type, the host
 *  pointer, the element count and size and the total byte count. Does nothing
 *  if debug output is disabled.
 */
void cnrn_target_delete_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len) {
    if (cnrn_target_enable_debug) {
        auto const type_name = cxx_demangle(typeid_T.name());
        auto const n_bytes = len * sizeof_T;
        std::cerr << file << ':' << line << ": cnrn_target_delete<" << type_name << ">(" << h_ptr
                  << ", " << len << " * " << sizeof_T << " = " << n_bytes << ')' << std::endl;
    }
}
/** @brief Log a cnrn_target_deviceptr call to stderr when GPU debug output is on.
 *
 *  Prints the call site (file:line), the demangled type, the host pointer and
 *  the device pointer it was translated to. Does nothing if debug output is
 *  disabled.
 */
void cnrn_target_deviceptr_debug(std::string_view file,
                                 int line,
                                 std::type_info const& typeid_T,
                                 void const* h_ptr,
                                 void* d_ptr) {
    if (cnrn_target_enable_debug) {
        auto const type_name = cxx_demangle(typeid_T.name());
        std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << type_name << ">("
                  << h_ptr << ") -> " << d_ptr << std::endl;
    }
}
/** @brief Log a cnrn_target_is_present call to stderr when GPU debug output is on.
 *
 *  Prints the call site (file:line), the demangled type, the host pointer and
 *  the device pointer that was found for it. Does nothing if debug output is
 *  disabled.
 */
void cnrn_target_is_present_debug(std::string_view file,
                                  int line,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr) {
    if (cnrn_target_enable_debug) {
        auto const type_name = cxx_demangle(typeid_T.name());
        std::cerr << file << ':' << line << ": cnrn_target_is_present<" << type_name << ">("
                  << h_ptr << ") -> " << d_ptr << std::endl;
    }
}
/** @brief Log a cnrn_target_memcpy_to_device call to stderr when GPU debug output is on.
 *
 *  Prints the call site (file:line), the demangled element type, the device
 *  (destination) pointer, the host (source) pointer, the element count and
 *  size and the total byte count. Does nothing if debug output is disabled.
 */
void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                        int line,
                                        std::size_t sizeof_T,
                                        std::type_info const& typeid_T,
                                        void const* h_ptr,
                                        std::size_t len,
                                        void* d_ptr) {
    if (cnrn_target_enable_debug) {
        auto const type_name = cxx_demangle(typeid_T.name());
        auto const n_bytes = len * sizeof_T;
        std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<" << type_name << ">("
                  << d_ptr << ", " << h_ptr << ", " << len << " * " << sizeof_T << " = " << n_bytes
                  << ')' << std::endl;
    }
}

#ifdef CORENEURON_ENABLE_PRESENT_TABLE
/** @brief Translate a host pointer to a device pointer using the present table.
 *
 *  The present table maps the host start address of each registered block to
 *  its size and device address. A host pointer is considered present if it
 *  lies inside [start, start + size) of a registered block; the returned
 *  device pointer carries the same byte offset into the device block.
 *
 *  @param must_be_present_or_null Value to return as the second element of the
 *         pair when @p h_ptr is non-null but is not found in the table.
 *  @param h_ptr Host pointer to translate; may be null.
 *  @return {device pointer, false} on success, {nullptr, false} if @p h_ptr is
 *          null, and {nullptr, must_be_present_or_null} if @p h_ptr is not
 *          covered by any registered block.
 */
std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
    if (!h_ptr) {
        return {nullptr, false};
    }
    std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
    // Concurrent calls to this method are safe, but they must be serialised
    // w.r.t. calls to the cnrn_target_*_update_present_table methods.
    std::shared_lock _{present_table_mutex};
    // First block whose start address is strictly greater than h_ptr. The
    // member function is used (rather than std::upper_bound) so the search is
    // logarithmic in the number of blocks.
    auto const next_block = present_table.upper_bound(h_byte_ptr);
    if (next_block == present_table.begin()) {
        // Either the table is empty or every block starts after h_ptr. There
        // is no candidate block, and std::prev(begin()) would be undefined
        // behaviour.
        return {nullptr, must_be_present_or_null};
    }
    // The block immediately before `next_block` is the last one whose start
    // address is less than or equal to h_ptr.
    auto const iter = std::prev(next_block);
    std::byte const* const h_start_of_block{iter->first};
    std::size_t const block_size{iter->second.size};
    std::byte* const d_start_of_block{iter->second.dev_ptr};
    bool const is_present{h_byte_ptr < h_start_of_block + block_size};
    if (!is_present) {
        return {nullptr, must_be_present_or_null};
    }
    return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
}

void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
    if (!h_ptr) {
        assert(!d_ptr);
        return;
    }
    std::lock_guard _{present_table_mutex};
    // TODO include more pedantic overlap checking?
    present_table_value new_val{};
    new_val.size = len;
    new_val.ref_count = 1;
    new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
    auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
                                                        std::move(new_val));
    if (!inserted) {
        // Insertion didn't occur because h_ptr was already in the present table
        assert(iter->second.size == len);
        assert(iter->second.dev_ptr == new_val.dev_ptr);
        ++(iter->second.ref_count);
    }
}
/** @brief Drop one registration of a host block from the present table.
 *
 *  The entry's reference count is decremented and the entry is erased once it
 *  reaches zero. The block is asserted to be registered with the given size.
 *
 *  @param h_ptr Host start address of the block; a null pointer is ignored.
 *  @param len   Size the block is expected to have been registered with.
 */
void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
    if (h_ptr == nullptr) {
        return;
    }
    // Exclusive lock: the table is modified below.
    std::lock_guard _{present_table_mutex};
    auto const entry = present_table.find(static_cast<std::byte const*>(h_ptr));
    assert(entry != present_table.end());
    assert(entry->second.size == len);
    if (--(entry->second.ref_count) == 0) {
        present_table.erase(entry);
    }
}
#endif

/** @brief Query how many accelerator devices the offload runtime reports.
 *
 *  Uses the OpenACC runtime (NVIDIA device type) or the OpenMP runtime,
 *  depending on how the GPU build was configured.
 *
 *  @throws std::runtime_error if built without OpenACC/OpenMP GPU support.
 */
int cnrn_target_get_num_devices() {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    // NVIDIA GPUs are the device type chosen by default; ask the OpenACC
    // runtime how many of them are available.
    return acc_get_num_devices(acc_device_nvidia);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    return omp_get_num_devices();
#else
    throw std::runtime_error(
        "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

/** @brief Select the accelerator device used by subsequent offload operations.
 *
 *  With OpenACC the NVIDIA device number is set through the OpenACC runtime.
 *  With OpenMP offload both the OpenMP default device and the CUDA current
 *  device are set.
 *
 *  @param device_num Index of the device to select.
 *  @throws std::runtime_error if built without OpenACC/OpenMP GPU support, or
 *          (OpenMP offload build) if cudaSetDevice reports an error.
 */
void cnrn_target_set_default_device(int device_num) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_set_device_num(device_num, acc_device_nvidia);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    omp_set_default_device(device_num);
    // It seems that with NVHPC 21.9 then only setting the default OpenMP device
    // is not enough: there were errors on some nodes when not-the-0th GPU was
    // used. These seemed to be related to the NMODL instance structs, which are
    // allocated using cudaMallocManaged.
    auto const cuda_code = cudaSetDevice(device_num);
    // Check the result explicitly instead of with assert(): an assert is
    // compiled out under NDEBUG, which would silently ignore a failure to
    // select the requested device.
    if (cuda_code != cudaSuccess) {
        throw std::runtime_error(
            std::string{"cnrn_target_set_default_device(): cudaSetDevice failed: "} +
            cudaGetErrorString(cuda_code));
    }
#else
    throw std::runtime_error(
        "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

#ifdef CORENEURON_ENABLE_GPU
#ifndef CORENEURON_UNIFIED_MEMORY
/** @brief Create a device-side copy of a Memb_list and the buffers it refers to.
 *
 *  The Memb_list struct itself is copied to the device, then each buffer it
 *  points to (global variables, node indices, pdata, thread data, net receive
 *  buffer, net send buffer) is copied in turn and the corresponding pointer
 *  member of the device-side struct is overwritten with the device address.
 *  `ml->data` is not copied here: its device address is looked up instead.
 *
 *  @param ml   Host-side membrane list.
 *  @param type Mechanism type index, used to look up per-mechanism metadata.
 *  @return Device pointer to the copied Memb_list, or nullptr for artificial
 *          mechanisms (which are not copied to the device).
 */
static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
    // As we never run code for artificial cell inside GPU we don't copy it.
    if (corenrn.get_is_artificial()[type]) {
        return nullptr;
    }

    auto d_ml = cnrn_target_copyin(ml);

    if (ml->global_variables) {
        assert(ml->global_variables_size);
        void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
                                          ml->global_variables_size);
        cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_inst);
    }

    int const n = ml->nodecount;
    int const szdp = corenrn.get_prop_dparam_size()[type];

    // ml->data is expected to be on the device already (it is only looked up,
    // not copied); just point the device-side struct at its device address.
    double* dptr = cnrn_target_deviceptr(ml->data);
    cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));

    int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n);
    cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices);

    if (szdp) {
        // pdata holds szdp entries per (padded) node
        int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
        int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt);
        cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata);
    }

    int const ts = corenrn.get_memb_funcs()[type].thread_size_;
    if (ts) {
        ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts);
        cnrn_target_memcpy_to_device(&(d_ml->_thread), &td);
    }

    // net_receive buffer associated with mechanism, if one exists
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    if (nrb) {
        NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb);
        cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb);

        int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size);
        cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);

        int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size);
        cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);

        double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size);
        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);

        double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size);
        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);

        // _displ has one more entry than the other arrays
        int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1);
        cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);

        int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size);
        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
    }

    /* copy NetSendBuffer_t on to GPU, if one exists */
    NetSendBuffer_t* nsb = ml->_net_send_buffer;
    if (nsb) {
        NetSendBuffer_t* d_nsb = cnrn_target_copyin(nsb);
        cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb);

        int* d_sendtype = cnrn_target_copyin(nsb->_sendtype, nsb->_size);
        cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_sendtype);

        int* d_vdata_index = cnrn_target_copyin(nsb->_vdata_index, nsb->_size);
        cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_vdata_index);

        int* d_pnt_index = cnrn_target_copyin(nsb->_pnt_index, nsb->_size);
        cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_pnt_index);

        int* d_weight_index = cnrn_target_copyin(nsb->_weight_index, nsb->_size);
        cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_weight_index);

        double* d_nsb_t = cnrn_target_copyin(nsb->_nsb_t, nsb->_size);
        cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_nsb_t);

        double* d_nsb_flag = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size);
        cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_nsb_flag);
    }

    return d_ml;
}
#endif

/** @brief Refresh the host copy of a Memb_list's buffers from the device.
 *
 *  Updates `data`, `nodeindices`, `pdata` (if the mechanism has dparams) and,
 *  if a net receive buffer exists, its counters and index arrays. Nothing is
 *  done for artificial mechanisms.
 *
 *  @param ml   Host-side membrane list whose buffers are refreshed.
 *  @param type Mechanism type index, used to look up per-mechanism metadata.
 */
static void update_ml_on_host(const Memb_list* ml, int type) {
    int is_art = corenrn.get_is_artificial()[type];
    if (is_art) {
        // Artificial mechanisms such as PatternStim and IntervalFire
        // are not copied onto the GPU. They should not, therefore, be
        // updated from the GPU.
        return;
    }

    int n = ml->nodecount;
    int szp = corenrn.get_prop_param_size()[type];
    int szdp = corenrn.get_prop_dparam_size()[type];

    // number of doubles in ml->data: szp values per (padded) node
    int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp;

    // OpenACC and OpenMP variants of the same device -> host update
    nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n]))
    nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n]))

    // number of ints in ml->pdata: szdp values per (padded) node; the update is
    // only performed when szdp is non-zero
    int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
    nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp))
    nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp))

    auto nrb = ml->_net_receive_buffer;

    // The net receive buffer update is guarded by `if (nrb != nullptr)`.
    // Note that _nrb_t and _nrb_flag are not part of the update list.
    // clang-format off
    nrn_pragma_acc(update self(nrb->_cnt,
                               nrb->_size,
                               nrb->_pnt_offset,
                               nrb->_displ_cnt,
                               nrb->_pnt_index[:nrb->_size],
                               nrb->_weight_index[:nrb->_size],
                               nrb->_displ[:nrb->_size + 1],
                               nrb->_nrb_index[:nrb->_size])
                          if (nrb != nullptr))
    nrn_pragma_omp(target update from(nrb->_cnt,
                                      nrb->_size,
                                      nrb->_pnt_offset,
                                      nrb->_displ_cnt,
                                      nrb->_pnt_index[:nrb->_size],
                                      nrb->_weight_index[:nrb->_size],
                                      nrb->_displ[:nrb->_size + 1],
                                      nrb->_nrb_index[:nrb->_size])
                                 if (nrb != nullptr))
    // clang-format on
}

/** @brief Release the device copies belonging to a Memb_list.
 *
 *  Deletes, in order: the net send buffer and its arrays, the net receive
 *  buffer and its arrays, the thread data, pdata, node indices, the global
 *  variables block and finally the Memb_list struct itself. Nothing is done
 *  for artificial mechanisms.
 *
 *  @param ml   Host-side membrane list whose device copies are deleted.
 *  @param type Mechanism type index, used to look up per-mechanism metadata.
 */
static void delete_ml_from_device(Memb_list* ml, int type) {
    if (corenrn.get_is_artificial()[type]) {
        return;
    }

    // Net send buffer: member arrays first, then the struct.
    if (NetSendBuffer_t* const send_buf = ml->_net_send_buffer) {
        cnrn_target_delete(send_buf->_nsb_flag, send_buf->_size);
        cnrn_target_delete(send_buf->_nsb_t, send_buf->_size);
        cnrn_target_delete(send_buf->_weight_index, send_buf->_size);
        cnrn_target_delete(send_buf->_pnt_index, send_buf->_size);
        cnrn_target_delete(send_buf->_vdata_index, send_buf->_size);
        cnrn_target_delete(send_buf->_sendtype, send_buf->_size);
        cnrn_target_delete(send_buf);
    }

    // Net receive buffer: member arrays first, then the struct.
    if (NetReceiveBuffer_t* const recv_buf = ml->_net_receive_buffer) {
        cnrn_target_delete(recv_buf->_nrb_index, recv_buf->_size);
        cnrn_target_delete(recv_buf->_displ, recv_buf->_size + 1);
        cnrn_target_delete(recv_buf->_nrb_flag, recv_buf->_size);
        cnrn_target_delete(recv_buf->_nrb_t, recv_buf->_size);
        cnrn_target_delete(recv_buf->_weight_index, recv_buf->_size);
        cnrn_target_delete(recv_buf->_pnt_index, recv_buf->_size);
        cnrn_target_delete(recv_buf);
    }

    int const node_count = ml->nodecount;
    int const dparam_size = corenrn.get_prop_dparam_size()[type];
    int const thread_size = corenrn.get_memb_funcs()[type].thread_size_;

    if (thread_size) {
        cnrn_target_delete(ml->_thread, thread_size);
    }
    if (dparam_size) {
        // pdata holds dparam_size entries per (padded) node
        int const pdata_count = nrn_soa_padded_size(node_count, SOA_LAYOUT) * dparam_size;
        cnrn_target_delete(ml->pdata, pdata_count);
    }
    cnrn_target_delete(ml->nodeindices, node_count);

    if (ml->global_variables) {
        assert(ml->global_variables_size);
        cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
                           ml->global_variables_size);
    }

    cnrn_target_delete(ml);
}

#endif

/**
 * @brief Copy the NrnThread array and the data it refers to onto the device.
 *
 * note: threads here are corresponding to global nrn_threads array
 *
 * In a GPU build every thread is first flagged for GPU execution if it has
 * nodes (`end > 0`) and the ion global map is copied to the device. Then:
 *  - with CORENEURON_UNIFIED_MEMORY only the PreSyn and VecPlay data of each
 *    thread are copied;
 *  - otherwise the whole NrnThread array is copied and, for every thread
 *    flagged for GPU execution, each buffer it points to is copied and the
 *    matching pointer in the device-side struct is patched to the device
 *    address.
 * Without CORENEURON_ENABLE_GPU the function does nothing.
 *
 * @param threads  Pointer to the first element of the NrnThread array (host).
 * @param nthreads Number of elements in @p threads.
 */
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    // initialize NrnThreads for gpu execution
    // empty thread or only artificial cells should be on cpu
    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;
        nt->compute_gpu = (nt->end > 0) ? 1 : 0;
        nt->_dt = dt;
    }

    nrn_ion_global_map_copyto_device();

#ifdef CORENEURON_UNIFIED_MEMORY
    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;  // NrnThread on host

        if (nt->n_presyn) {
            // NOTE(review): the returned device pointer is not used in this branch
            PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
        }

        if (nt->n_vecplay) {
            /* copy VecPlayContinuous instances */
            /** just empty containers */
            void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
            // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu,
            // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay
            // inside nt on cpu (due to unified memory).

            nrn_VecPlay_copyto_device(nt, d_vecplay);
        }

        if (!nt->_permute && nt->end > 0) {
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        }
    }

#else
    /* -- copy NrnThread to device. this needs to be contiguous vector because offset is used to
     * find
     * corresponding NrnThread using Point_process in NET_RECEIVE block
     */
    NrnThread* d_threads = cnrn_target_copyin(threads, nthreads);

    if (interleave_info == nullptr) {
        printf("\n Warning: No permutation data? Required for linear algebra!");
    }

    /* pointers for data struct on device, starting with d_ */

    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;      // NrnThread on host
        NrnThread* d_nt = d_threads + i;  // NrnThread on device
        // threads not flagged for GPU execution above are left untouched
        if (!nt->compute_gpu) {
            continue;
        }
        double* d__data;  // nrn_threads->_data on device

        /* -- copy _data to device -- */

        /*copy all double data for thread */
        d__data = cnrn_target_copyin(nt->_data, nt->_ndata);


        /* Here is the example of using OpenACC data enter/exit
         * Remember that we are not allowed to use nt->_data but we have to use:
         *      double *dtmp = nt->_data;  // now use dtmp!
                #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
                #pragma acc wait(nt->stream_id)
         */

        /*update d_nt._data to point to device copy */
        cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data);

        /* -- setup rhs, d, a, b, v, node_area to point to device copy -- */
        double* dptr;

        /* for padding, we have to recompute ne */
        int ne = nrn_soa_padded_size(nt->end, 0);

        // each of the following arrays is addressed as a slice of `ne` doubles
        // at a fixed multiple of `ne` from the start of the device _data block
        dptr = d__data + 0 * ne;
        cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr));

        dptr = d__data + 1 * ne;
        cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr));

        dptr = d__data + 2 * ne;
        cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr));

        dptr = d__data + 3 * ne;
        cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr));

        dptr = d__data + 4 * ne;
        cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr));

        dptr = d__data + 5 * ne;
        cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr));

        if (nt->_actual_diam) {
            dptr = d__data + 6 * ne;
            cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr));
        }

        int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end);
        cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index));

        /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/
        Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size());
        cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list));

        /* -- copy NrnThreadMembList list ml to device -- */

        // device address of the previously copied list node; only read after
        // the first iteration has assigned it
        NrnThreadMembList* d_last_tml;

        bool first_tml = true;

        for (auto tml = nt->tml; tml; tml = tml->next) {
            /*copy tml to device*/
            /*QUESTIONS: does tml will point to nullptr as in host ? : I assume so!*/
            auto d_tml = cnrn_target_copyin(tml);

            /*first tml is pointed by nt */
            if (first_tml) {
                cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml);
                first_tml = false;
            } else {
                /*rest of tml forms linked list */
                cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml);
            }

            // book keeping for linked-list
            d_last_tml = d_tml;

            /* now for every tml, there is a ml. copy that and setup pointer */
            // (copy_ml_to_device returns nullptr for artificial mechanisms)
            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
            cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml);
            /* setup nt._ml_list */
            cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml);
        }

        if (nt->shadow_rhs_cnt) {
            double* d_shadow_ptr;

            int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);

            /* copy shadow_rhs to device and fix-up the pointer */
            d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt);
            cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr);

            /* copy shadow_d to device and fix-up the pointer */
            d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt);
            cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr);
        }

        /* Fast membrane current calculation struct */
        if (nt->nrn_fast_imem) {
            NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem);
            cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem);
            {
                double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
                cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr);
            }
            {
                double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end);
                cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr);
            }
        }

        if (nt->n_pntproc) {
            /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU
             */
            Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc);
            cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr);
        }

        if (nt->n_weight) {
            /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */
            double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight);
            cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights);
        }

        if (nt->_nvdata) {
            /* copy vdata which is setup in bbcore_read. This contains cuda allocated
             * nrnran123_State * */
            void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata);
            cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata);
        }

        if (nt->n_presyn) {
            /* copy presyn vector used for spike exchange, note we have added new PreSynHelper due
             * to issue
             * while updating PreSyn objects which has virtual base class. May be this is issue due
             * to
             * VTable and alignment */
            PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn);
            cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper);
            PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
            cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns);
        }

        if (nt->_net_send_buffer_size) {
            /* copy send_receive buffer */
            int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer,
                                                        nt->_net_send_buffer_size);
            cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer);
        }

        if (nt->n_vecplay) {
            /* copy VecPlayContinuous instances */
            /** just empty containers */
            void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
            cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay);

            nrn_VecPlay_copyto_device(nt, d_vecplay);
        }

        if (nt->_permute) {
            // The array lengths copied below differ between the two supported
            // permutation types.
            if (interleave_permute_type == 1) {
                /* todo: not necessary to setup pointers, just copy it */
                InterleaveInfo* info = interleave_info + i;
                int* d_ptr = nullptr;
                InterleaveInfo* d_info = cnrn_target_copyin(info);

                d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1);
                cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);

                d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell);
                cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);

                d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell);
                cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);

                d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell);
                cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);

            } else if (interleave_permute_type == 2) {
                /* todo: not necessary to setup pointers, just copy it */
                InterleaveInfo* info = interleave_info + i;
                InterleaveInfo* d_info = cnrn_target_copyin(info);
                int* d_ptr = nullptr;

                d_ptr = cnrn_target_copyin(info->stride, info->nstride);
                cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);

                d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1);
                cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);

                d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1);
                cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);

                d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1);
                cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr);

                d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp);
                cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
            } else {
                printf("\n ERROR: only --cell_permute = [12] implemented");
                abort();
            }
        } else {
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        }

        {
            TrajectoryRequests* tr = nt->trajec_requests;
            if (tr) {
                // Create a device-side copy of the `trajec_requests` struct and
                // make sure the device-side NrnThread object knows about it.
                TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr);
                cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests);
                // Initialise the double** gather member of the struct.
                double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec);
                cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather);
                // Initialise the double** varrays member of the struct if it's
                // set.
                double** d_tr_varrays{nullptr};
                if (tr->varrays) {
                    d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec);
                    cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays);
                }
                // NOTE: this loop variable `i` shadows the thread index `i` of
                // the enclosing loop.
                for (int i = 0; i < tr->n_trajec; ++i) {
                    if (tr->varrays) {
                        // tr->varrays[i] is a buffer of tr->bsize doubles on the host,
                        // make a device-side copy of it and store a pointer to it in
                        // the device-side version of tr->varrays.
                        double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize);
                        cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i);
                    }
                    // tr->gather[i] is a double* referring to (host) data in the
                    // (host) _data block
                    auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]);
                    cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i);
                }
                // TODO: other `double** scatter` and `void** vpr` members of
                // the TrajectoryRequests struct are not copied to the device.
                // The `int vsize` member is updated during the simulation but
                // not kept up to date timestep-by-timestep on the device.
            }
        }
        // The two fornetcon permutation arrays are copied unconditionally
        // (no check on their size or on the host pointer).
        {
            auto* d_fornetcon_perm_indices = cnrn_target_copyin(nt->_fornetcon_perm_indices,
                                                                nt->_fornetcon_perm_indices_size);
            cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_perm_indices),
                                         &d_fornetcon_perm_indices);
        }
        {
            auto* d_fornetcon_weight_perm = cnrn_target_copyin(nt->_fornetcon_weight_perm,
                                                               nt->_fornetcon_weight_perm_size);
            cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_weight_perm), &d_fornetcon_weight_perm);
        }
    }

#endif
#else
    // Not a GPU build: nothing to do, silence unused-parameter warnings.
    (void) threads;
    (void) nthreads;
#endif
}

/** @brief Copy the contents of an IvocVect to the device and attach them to `to`.
 *
 *  In a GPU build, if `from` is non-empty its data is copied to the device and
 *  the `data_` member of `to` is overwritten (via a host-to-device memcpy)
 *  with the resulting device address. Otherwise nothing is done.
 *
 *  @param from Host-side vector providing the data.
 *  @param to   Vector whose `data_` member is patched; as noted by the original
 *              author, by default this is the destination object on the device.
 */
void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
#ifdef CORENEURON_ENABLE_GPU
    size_t const n_elements = from.size();
    if (n_elements == 0) {
        return;
    }
    double* d_data = cnrn_target_copyin(from.data(), n_elements);
    cnrn_target_memcpy_to_device(&(to.data_), &d_data);
#else
    static_cast<void>(from);
    static_cast<void>(to);
#endif
}

/** @brief Release the device copy of an IvocVect's data.
 *
 *  In a GPU build, deletes the device copy of `vec.data()` when the vector is
 *  non-empty. Otherwise nothing is done.
 *
 *  @param vec Host-side vector whose data was previously copied to the device.
 */
void delete_ivoc_vect_from_device(IvocVect& vec) {
#ifdef CORENEURON_ENABLE_GPU
    if (auto const n_elements = vec.size(); n_elements != 0) {
        cnrn_target_delete(vec.data(), n_elements);
    }
#else
    (void) vec;
#endif
}

/** Double the capacity of a mechanism's NetReceiveBuffer_t.
 *
 *  Does nothing if `ml` has no net receive buffer. Otherwise every array of
 *  the buffer is reallocated on the host with twice as many entries (`_displ`
 *  keeps one extra entry) and the previous contents are preserved. When
 *  `nt->compute_gpu` is set in a GPU build, the old arrays are first deleted
 *  from the device and the new ones are copied in afterwards.
 *
 *  @param nt thread owning the buffer; only `compute_gpu` is read here.
 *  @param ml mechanism list whose `_net_receive_buffer` is grown.
 */
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    if (!nrb) {
        return;
    }

#ifdef CORENEURON_ENABLE_GPU
    if (nt->compute_gpu) {
        // free existing vectors in buffers on gpu
        // (this has to happen while the old host pointers and the old _size
        // are still available, i.e. before the host reallocation below)
        cnrn_target_delete(nrb->_pnt_index, nrb->_size);
        cnrn_target_delete(nrb->_weight_index, nrb->_size);
        cnrn_target_delete(nrb->_nrb_t, nrb->_size);
        cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
        cnrn_target_delete(nrb->_displ, nrb->_size + 1);
        cnrn_target_delete(nrb->_nrb_index, nrb->_size);
    }
#endif
    // Reallocate host buffers using ecalloc_align (as in phase2.cpp) and
    // free_memory (as in nrn_setup.cpp)
    // Note: `old_size` is captured here, before `_size` is doubled, whereas
    // `nrb->_size` is read inside the lambda when it is called, i.e. after
    // the doubling. `extra_size` is the number of entries an array holds
    // beyond `_size` (1 for `_displ`).
    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
        static_assert(std::is_trivial<T>::value,
                      "Only trivially constructible and copiable types are supported.");
        static_assert(std::is_same<decltype(ptr), T*&>::value,
                      "ptr should be reference-to-pointer");
        // allocate the new, larger array and carry over the old contents
        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
        free_memory(ptr);
        ptr = new_data;
    };
    nrb->_size *= 2;
    realloc(nrb->_pnt_index);
    realloc(nrb->_weight_index);
    realloc(nrb->_nrb_t);
    realloc(nrb->_nrb_flag);
    realloc(nrb->_displ, 1);
    realloc(nrb->_nrb_index);
#ifdef CORENEURON_ENABLE_GPU
    if (nt->compute_gpu) {
        // update device copy
        nrn_pragma_acc(update device(nrb));
        nrn_pragma_omp(target update to(nrb));

        NetReceiveBuffer_t* const d_nrb{cnrn_target_deviceptr(nrb)};
        // recopy the vectors in the buffer
        // (each new host array is copied in and the matching pointer member
        // of the device-side struct is then set to the returned pointer)
        int* const d_pnt_index{cnrn_target_copyin(nrb->_pnt_index, nrb->_size)};
        cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);

        int* const d_weight_index{cnrn_target_copyin(nrb->_weight_index, nrb->_size)};
        cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);

        double* const d_nrb_t{cnrn_target_copyin(nrb->_nrb_t, nrb->_size)};
        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);

        double* const d_nrb_flag{cnrn_target_copyin(nrb->_nrb_flag, nrb->_size)};
        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);

        int* const d_displ{cnrn_target_copyin(nrb->_displ, nrb->_size + 1)};
        cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);

        int* const d_nrb_index{cnrn_target_copyin(nrb->_nrb_index, nrb->_size)};
        cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
    }
#endif
}

/// Pair of (point-process instance index, position in the net_receive buffer).
using NRB_P = std::pair<int, int>;

/** Strict ordering used with std::priority_queue over NRB_P entries.
 *
 *  Returns true when `a` must be popped after `b`: entries are ordered by
 *  ascending instance index (`first`) and, for the same instance, by
 *  ascending buffer position (`second`), so that events of one instance keep
 *  their original net_receive order.
 *
 *  The call operator is `const` so the comparator can also be invoked through
 *  a const object or const reference, as standard containers and algorithms
 *  are allowed to do.
 */
struct comp {
    bool operator()(const NRB_P& a, const NRB_P& b) const {
        if (a.first == b.first) {
            return a.second > b.second;  // same instances in original net_receive order
        }
        return a.first > b.first;
    }
};

/** Group the buffered net_receive events by point-process instance.
 *
 *  On return `_nrb_index` lists the buffer positions ordered by instance index
 *  (ties keep buffer order), `_displ[g].._displ[g + 1]` delimits the entries
 *  of the g-th distinct instance inside `_nrb_index`, and `_displ_cnt` is the
 *  number of distinct instances. An empty buffer only resets `_displ_cnt`.
 */
static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) {
    Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-order");
    if (nrb->_cnt == 0) {
        nrb->_displ_cnt = 0;
        return;
    }

    // Heap whose top is always the smallest (instance, position) pair.
    std::priority_queue<NRB_P, std::vector<NRB_P>, comp> pending;
    for (int pos = 0; pos < nrb->_cnt; ++pos) {
        pending.push(NRB_P(nrb->_pnt_index[pos], pos));
    }

    int n_groups = 0;
    int n_ordered = 0;
    int previous_instance = -1;
    nrb->_displ[0] = 0;

    for (; !pending.empty(); pending.pop()) {
        const NRB_P& next = pending.top();
        nrb->_nrb_index[n_ordered] = next.second;
        ++n_ordered;
        // a change of instance index opens a new group
        if (next.first != previous_instance) {
            ++n_groups;
            previous_instance = next.first;
        }
        // the end of the current group moves with every entry appended to it
        nrb->_displ[n_groups] = n_ordered;
    }
    nrb->_displ_cnt = n_groups;
}

/* When the NET_RECEIVE block is executed on GPU, the device needs the indices
 * of the synapse instances that have to be processed during the current
 * timestep. For every non-artificial mechanism of the thread that has a
 * non-empty NetReceiveBuffer_t, this function first orders the buffered events
 * by instance (see net_receive_buffer_order) and then, if the thread computes
 * on GPU, updates the device copy of the counters and of the first `_cnt`
 * entries of each array of the buffer (`_displ_cnt + 1` entries for `_displ`).
 * The NetReceiveBuffer_t struct itself is deliberately not updated, because
 * that would overwrite the pointer members of the device copy.
 * Note: this is very preliminary implementation, optimisations will be done after first
 * functional version.
 */
void update_net_receive_buffer(NrnThread* nt) {
    Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf");
    for (auto tml = nt->tml; tml; tml = tml->next) {
        // artificial mechanisms are skipped
        int is_art = corenrn.get_is_artificial()[tml->index];
        if (is_art) {
            continue;
        }
        // net_receive buffer to copy
        NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer;

        // if net receive buffer exist for mechanism
        if (nrb && nrb->_cnt) {
            // instance order to avoid race. setup _displ and _nrb_index
            net_receive_buffer_order(nrb);

            if (nt->compute_gpu) {
                Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
                // note that dont update nrb otherwise we lose pointers

                // clang-format off

                /* update the counters and the used part of every array */
                nrn_pragma_acc(update device(nrb->_cnt,
                                             nrb->_displ_cnt,
                                             nrb->_pnt_index[:nrb->_cnt],
                                             nrb->_weight_index[:nrb->_cnt],
                                             nrb->_nrb_t[:nrb->_cnt],
                                             nrb->_nrb_flag[:nrb->_cnt],
                                             nrb->_displ[:nrb->_displ_cnt + 1],
                                             nrb->_nrb_index[:nrb->_cnt])
                                             async(nt->stream_id))
                nrn_pragma_omp(target update to(nrb->_cnt,
                                                nrb->_displ_cnt,
                                                nrb->_pnt_index[:nrb->_cnt],
                                                nrb->_weight_index[:nrb->_cnt],
                                                nrb->_nrb_t[:nrb->_cnt],
                                                nrb->_nrb_flag[:nrb->_cnt],
                                                nrb->_displ[:nrb->_displ_cnt + 1],
                                                nrb->_nrb_index[:nrb->_cnt]))
                // clang-format on
            }
        }
    }
    // the OpenACC updates above are asynchronous on nt->stream_id: wait for them
    nrn_pragma_acc(wait(nt->stream_id))
}

/** Copy the used part of a NetSendBuffer_t from the device back to the host.
 *
 *  Does nothing unless `nt->compute_gpu` is set (and nothing at all in builds
 *  without GPU support). Aborts if `nsb->_cnt` exceeds `nsb->_size`, because
 *  the buffer cannot be grown during GPU execution. Otherwise the first
 *  `nsb->_cnt` entries of every array of the buffer are updated on the host.
 *
 *  @param nt  thread the buffer belongs to.
 *  @param nsb send buffer to refresh on the host.
 */
void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
#ifdef CORENEURON_ENABLE_GPU
    if (!nt->compute_gpu)
        return;

    // check if nsb->_cnt was exceeded on GPU: as the buffer can not be increased
    // during gpu execution, we should just abort the execution.
    // \todo: this needs to be fixed with different memory allocation strategy
    if (nsb->_cnt > nsb->_size) {
        printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
        nrn_abort(1);
    }

    // Nothing was queued: there is nothing to transfer and nothing to time.
    if (!nsb->_cnt) {
        return;
    }

    // The phase object must stay alive until the end of the function so that
    // it brackets the transfer below (assuming, as its other uses in this
    // file suggest, that Instrumentor::phase marks the lifetime of the
    // object). It used to be declared inside its own `if` block and was
    // therefore destroyed before the transfer started.
    Instrumentor::phase p_net_send_buffer_gpu2cpu("net-send-buf-gpu2cpu");
    // clang-format off
    nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt],
                               nsb->_vdata_index[:nsb->_cnt],
                               nsb->_pnt_index[:nsb->_cnt],
                               nsb->_weight_index[:nsb->_cnt],
                               nsb->_nsb_t[:nsb->_cnt],
                               nsb->_nsb_flag[:nsb->_cnt])
                          if (nsb->_cnt))
    nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt],
                                      nsb->_vdata_index[:nsb->_cnt],
                                      nsb->_pnt_index[:nsb->_cnt],
                                      nsb->_weight_index[:nsb->_cnt],
                                      nsb->_nsb_t[:nsb->_cnt],
                                      nsb->_nsb_flag[:nsb->_cnt])
                                 if (nsb->_cnt))
    // clang-format on
#else
    (void) nt;
    (void) nsb;
#endif
}

/** Refresh the host copy of the simulation state of every GPU thread.
 *
 *  For each of the `nthreads` entries of `threads` that has `compute_gpu` set
 *  and at least one node (`end > 0`), the node arrays, the mechanism data
 *  (through update_ml_on_host), the shadow vectors, the fast_imem arrays, the
 *  point processes, the weights, the presyns and the trajectory buffers are
 *  updated from the device. Does nothing in builds without GPU support.
 */
void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU

    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;

        if (nt->compute_gpu && (nt->end > 0)) {
            /* -- copy data to host -- */

            // number of entries of the node arrays, as given by nrn_soa_padded_size
            int ne = nrn_soa_padded_size(nt->end, 0);

            // clang-format off
            nrn_pragma_acc(update self(nt->_actual_rhs[:ne],
                                       nt->_actual_d[:ne],
                                       nt->_actual_a[:ne],
                                       nt->_actual_b[:ne],
                                       nt->_actual_v[:ne],
                                       nt->_actual_area[:ne]))
            nrn_pragma_omp(target update from(nt->_actual_rhs[:ne],
                                              nt->_actual_d[:ne],
                                              nt->_actual_a[:ne],
                                              nt->_actual_b[:ne],
                                              nt->_actual_v[:ne],
                                              nt->_actual_area[:ne]))
            // clang-format on

            // the diameter array is optional
            nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr))
            nrn_pragma_omp(
                target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr))

            /* @todo: nt._ml_list[tml->index] = tml->ml; */

            /* -- copy NrnThreadMembList list ml to host -- */
            for (auto tml = nt->tml; tml; tml = tml->next) {
                // index and nodecount are only updated for non-artificial
                // mechanisms; update_ml_on_host is called for all of them
                if (!corenrn.get_is_artificial()[tml->index]) {
                    nrn_pragma_acc(update self(tml->index, tml->ml->nodecount))
                    nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount))
                }
                update_ml_on_host(tml->ml, tml->index);
            }

            int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
            /* copy shadow_rhs to host */
            /* copy shadow_d to host */
            nrn_pragma_acc(
                update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt))
            nrn_pragma_omp(target update from(
                nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt))

            // clang-format off
            nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end],
                                       nt->nrn_fast_imem->nrn_sav_d[:nt->end])
                                  if (nt->nrn_fast_imem != nullptr))
            nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end],
                                              nt->nrn_fast_imem->nrn_sav_d[:nt->end])
                                         if (nt->nrn_fast_imem != nullptr))
            // clang-format on

            nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc))
            nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc))

            nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight))
            nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight))

            nrn_pragma_acc(update self(
                nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn))
            nrn_pragma_omp(target update from(
                nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn))

            {
                TrajectoryRequests* tr = nt->trajec_requests;
                if (tr && tr->varrays) {
                    // The full buffers have `bsize` entries, but only `vsize`
                    // of them are valid.
                    for (int i = 0; i < tr->n_trajec; ++i) {
                        nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize]))
                        nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize]))
                    }
                }
            }

            /* don't update vdata, it is an array of pointers
               nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata]) if (nt->_nvdata))
               nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata]) if (nt->_nvdata))
             */
        }
    }
#else
    (void) threads;
    (void) nthreads;
#endif
}

/**
 * Copy weights from GPU to CPU
 *
 * User may record NetCon weights at the end of simulation.
 * For this purpose update weights of all NrnThread objects
 * from GPU to CPU.
 *
 * Threads that do not compute on GPU or that have no weights are skipped.
 * In builds without GPU support this function does nothing.
 *
 * @param threads  array of NrnThread objects.
 * @param nthreads number of entries in `threads`.
 */
void update_weights_from_gpu(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;
        size_t n_weight = nt->n_weight;
        if (nt->compute_gpu && n_weight > 0) {
            double* weights = nt->weights;
            nrn_pragma_acc(update host(weights [0:n_weight]))
            nrn_pragma_omp(target update from(weights [0:n_weight]))
        }
    }
#else
    // Mark the parameters as used, like the other entry points of this file
    // do, so that CPU-only builds do not warn about unused parameters.
    (void) threads;
    (void) nthreads;
#endif
}

/** Cleanup device memory that is being tracked by the OpenACC runtime.
 *
 *  This function painstakingly calls `cnrn_target_delete` in reverse order on all
 *  pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`.
 *  This cleanup ensures that if the GPU is initialised multiple times from the
 *  same process then the OpenACC runtime will not be polluted with old
 *  pointers, which can cause errors. In particular if we do:
 *  @code
 *    {
 *      // ... some_ptr is dynamically allocated ...
 *      cnrn_target_copyin(some_ptr, some_size);
 *      // ... do some work ...
 *      // cnrn_target_delete(some_ptr);
 *      free(some_ptr);
 *    }
 *    {
 *      // ... same_ptr_again is dynamically allocated at the same address ...
 *      cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR
 *    }
 *  @endcode
 *  the application will/may abort with an error such as:
 *    FATAL ERROR: variable in data clause is partially present on the device.
 *  The pattern above is typical of calling CoreNEURON on GPU multiple times in
 *  the same process.
 */
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        NrnThread* nt = threads + i;
        // threads that do not compute on GPU are skipped entirely
        if (!nt->compute_gpu) {
            continue;
        }
        cnrn_target_delete(nt->_fornetcon_weight_perm, nt->_fornetcon_weight_perm_size);
        cnrn_target_delete(nt->_fornetcon_perm_indices, nt->_fornetcon_perm_indices_size);
        {
            // trajectory requests: the per-trajectory buffers go first, then
            // the arrays of pointers, then the struct itself
            TrajectoryRequests* tr = nt->trajec_requests;
            if (tr) {
                if (tr->varrays) {
                    // NOTE: this `i` shadows the thread index of the outer
                    // loop; the outer `i` is visible again after this loop.
                    for (int i = 0; i < tr->n_trajec; ++i) {
                        cnrn_target_delete(tr->varrays[i], tr->bsize);
                    }
                    cnrn_target_delete(tr->varrays, tr->n_trajec);
                }
                cnrn_target_delete(tr->gather, tr->n_trajec);
                cnrn_target_delete(tr);
            }
        }
        if (nt->_permute) {
            // the array sizes of the InterleaveInfo of thread `i` depend on
            // the permutation type
            if (interleave_permute_type == 1) {
                InterleaveInfo* info = interleave_info + i;
                cnrn_target_delete(info->cellsize, nt->ncell);
                cnrn_target_delete(info->lastnode, nt->ncell);
                cnrn_target_delete(info->firstnode, nt->ncell);
                cnrn_target_delete(info->stride, info->nstride + 1);
                cnrn_target_delete(info);
            } else if (interleave_permute_type == 2) {
                InterleaveInfo* info = interleave_info + i;
                cnrn_target_delete(info->cellsize, info->nwarp);
                cnrn_target_delete(info->stridedispl, info->nwarp + 1);
                cnrn_target_delete(info->lastnode, info->nwarp + 1);
                cnrn_target_delete(info->firstnode, info->nwarp + 1);
                cnrn_target_delete(info->stride, info->nstride);
                cnrn_target_delete(info);
            }
        }

        if (nt->n_vecplay) {
            // the individual play records first, then the array holding them
            nrn_VecPlay_delete_from_device(nt);
            cnrn_target_delete(nt->_vecplay, nt->n_vecplay);
        }

        // Cleanup send_receive buffer.
        if (nt->_net_send_buffer_size) {
            cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size);
        }

        if (nt->n_presyn) {
            cnrn_target_delete(nt->presyns, nt->n_presyn);
            cnrn_target_delete(nt->presyns_helper, nt->n_presyn);
        }

        // Cleanup data that's setup in bbcore_read.
        if (nt->_nvdata) {
            cnrn_target_delete(nt->_vdata, nt->_nvdata);
        }

        // Cleanup weight vector used in NET_RECEIVE
        if (nt->n_weight) {
            cnrn_target_delete(nt->weights, nt->n_weight);
        }

        // Cleanup point processes
        if (nt->n_pntproc) {
            cnrn_target_delete(nt->pntprocs, nt->n_pntproc);
        }

        if (nt->nrn_fast_imem) {
            cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end);
            cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
            cnrn_target_delete(nt->nrn_fast_imem);
        }

        if (nt->shadow_rhs_cnt) {
            int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
            cnrn_target_delete(nt->_shadow_d, pcnt);
            cnrn_target_delete(nt->_shadow_rhs, pcnt);
        }

        // mechanism data of every list entry, then the entry itself
        for (auto tml = nt->tml; tml; tml = tml->next) {
            delete_ml_from_device(tml->ml, tml->index);
            cnrn_target_delete(tml);
        }
        cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size());
        cnrn_target_delete(nt->_v_parent_index, nt->end);
        cnrn_target_delete(nt->_data, nt->_ndata);
    }
    // finally the array of NrnThread objects and the global ion map
    cnrn_target_delete(threads, nthreads);
    nrn_ion_global_map_delete_from_device();
#endif
}


/** Copy a NewtonSpace and the arrays it points to onto the device.
 *
 *  The struct is copied first, then each array is copied and the matching
 *  pointer member of the device-side struct is set to the returned pointer.
 *  Returns early unless the first thread computes on GPU; does nothing in
 *  builds without GPU support.
 *
 *  @param ns host-side NewtonSpace to mirror.
 */
void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
#ifdef CORENEURON_ENABLE_GPU
    // FIXME this check needs to be tweaked if we ever want to run with a mix
    //       of CPU and GPU threads.
    if (nrn_threads[0].compute_gpu == 0) {
        return;
    }

    // number of entries in each of the per-instance work arrays
    int n = ns->n * ns->n_instance;
    // actually, the values of double do not matter, only the  pointers.
    NewtonSpace* d_ns = cnrn_target_copyin(ns);

    double* pd;

    pd = cnrn_target_copyin(ns->delta_x, n);
    cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd);

    pd = cnrn_target_copyin(ns->high_value, n);
    cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd);

    pd = cnrn_target_copyin(ns->low_value, n);
    cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd);

    pd = cnrn_target_copyin(ns->rowmax, n);
    cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd);

    auto pint = cnrn_target_copyin(ns->perm, n);
    cnrn_target_memcpy_to_device(&(d_ns->perm), &pint);

    // array of `ns->n` row pointers
    auto ppd = cnrn_target_copyin(ns->jacobian, ns->n);
    cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd);

    // the actual jacobian doubles were allocated as a single array
    double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n);

    // point every row of the copied pointer array into the copied data block
    for (int i = 0; i < ns->n; ++i) {
        pd = d_jacdat + i * n;
        cnrn_target_memcpy_to_device(&(ppd[i]), &pd);
    }
#else
    // Mark the parameter as used, like the other entry points of this file
    // do, so that CPU-only builds do not warn about an unused parameter.
    static_cast<void>(ns);
#endif
}

/** Delete the device copy of a NewtonSpace and of the arrays it points to.
 *
 *  The arrays are deleted before the struct itself. Returns early unless the
 *  first thread computes on GPU; does nothing in builds without GPU support.
 *
 *  @param ns host-side NewtonSpace whose device mirror is released.
 */
void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
#ifdef CORENEURON_ENABLE_GPU
    // FIXME this check needs to be tweaked if we ever want to run with a mix
    //       of CPU and GPU threads.
    if (nrn_threads[0].compute_gpu == 0) {
        return;
    }
    // number of entries in each of the per-instance work arrays
    int n = ns->n * ns->n_instance;
    cnrn_target_delete(ns->jacobian[0], ns->n * n);
    cnrn_target_delete(ns->jacobian, ns->n);
    cnrn_target_delete(ns->perm, n);
    cnrn_target_delete(ns->rowmax, n);
    cnrn_target_delete(ns->low_value, n);
    cnrn_target_delete(ns->high_value, n);
    cnrn_target_delete(ns->delta_x, n);
    cnrn_target_delete(ns);
#else
    // Mark the parameter as used, like the other entry points of this file
    // do, so that CPU-only builds do not warn about an unused parameter.
    static_cast<void>(ns);
#endif
}

/** Copy a SparseObj, its arrays and its Elm elements onto the device.
 *
 *  After copying the struct and its arrays, every Elm reachable from the row
 *  starts is copied and the pointers between the device copies are set in two
 *  passes: the first one sets `c_left`, `r_up` and `value`, the second one
 *  `r_down` and `c_right`. Finally the entries of the device `coef_list` are
 *  set. Returns early unless the first thread computes on GPU; does nothing
 *  in builds without GPU support or with unified memory.
 *
 *  @param so host-side SparseObj to mirror.
 */
void nrn_sparseobj_copyto_device(SparseObj* so) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // FIXME this check needs to be tweaked if we ever want to run with a mix
    //       of CPU and GPU threads.
    if (nrn_threads[0].compute_gpu == 0) {
        return;
    }

    unsigned n1 = so->neqn + 1;
    SparseObj* d_so = cnrn_target_copyin(so);
    // pointer fields of SparseObj set up below:
    //   rowst, diag, rhs, ngetcall, coef_list
    // pointer fields of Elm set up below:
    //   c_left, r_up, value (first pass), r_down, c_right (second pass)
    // do not care about the Elm* ptr value, just the space.

    Elm** d_rowst = cnrn_target_copyin(so->rowst, n1);
    cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst);

    Elm** d_diag = cnrn_target_copyin(so->diag, n1);
    cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag);

    unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded);
    cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu);

    double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded);
    cnrn_target_memcpy_to_device(&(d_so->rhs), &pd);

    double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size);
    cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list);

    // Fill in relevant Elm pointer values

    for (unsigned irow = 1; irow < n1; ++irow) {
        for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
            Elm* pelm = cnrn_target_copyin(elm);

            // the first element of a row is referenced from rowst; every
            // other element gets its c_left set to the device copy of its
            // left neighbour, which was copied in the previous iteration
            if (elm == so->rowst[irow]) {
                cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm);
            } else {
                Elm* d_e = cnrn_target_deviceptr(elm->c_left);
                cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e);
            }

            if (elm->col == elm->row) {
                cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm);
            }

            // elements above were copied while visiting earlier rows
            if (irow > 1) {
                if (elm->r_up) {
                    Elm* d_e = cnrn_target_deviceptr(elm->r_up);
                    cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e);
                }
            }

            pd = cnrn_target_copyin(elm->value, so->_cntml_padded);
            cnrn_target_memcpy_to_device(&(pelm->value), &pd);
        }
    }

    // visit all the Elm again and fill in pelm->r_down and pelm->c_right
    // (these neighbours did not have a device copy yet during the first pass)
    for (unsigned irow = 1; irow < n1; ++irow) {
        for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
            auto pelm = cnrn_target_deviceptr(elm);
            if (elm->r_down) {
                auto d_e = cnrn_target_deviceptr(elm->r_down);
                cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e);
            }
            if (elm->c_right) {
                auto d_e = cnrn_target_deviceptr(elm->c_right);
                cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e);
            }
        }
    }

    // Fill in the d_so->coef_list
    for (unsigned i = 0; i < so->coef_list_size; ++i) {
        pd = cnrn_target_deviceptr(so->coef_list[i]);
        cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd);
    }
#else
    // Mark the parameter as used, like the other entry points of this file
    // do, so that builds taking this branch do not warn about it.
    static_cast<void>(so);
#endif
}

/** Delete the device copy of a SparseObj, of its arrays and of its Elm elements.
 *
 *  Every Elm reachable from the row starts is deleted together with its
 *  `value` array, then the arrays of the SparseObj and finally the struct
 *  itself. Returns early unless the first thread computes on GPU; does
 *  nothing in builds without GPU support or with unified memory.
 *
 *  @param so host-side SparseObj whose device mirror is released.
 */
void nrn_sparseobj_delete_from_device(SparseObj* so) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // FIXME this check needs to be tweaked if we ever want to run with a mix
    //       of CPU and GPU threads.
    if (nrn_threads[0].compute_gpu == 0) {
        return;
    }
    unsigned n1 = so->neqn + 1;
    // rows are walked on the host, so the host-side links stay usable while
    // the device copies are being deleted
    for (unsigned irow = 1; irow < n1; ++irow) {
        for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
            cnrn_target_delete(elm->value, so->_cntml_padded);
            cnrn_target_delete(elm);
        }
    }
    cnrn_target_delete(so->coef_list, so->coef_list_size);
    cnrn_target_delete(so->rhs, n1 * so->_cntml_padded);
    cnrn_target_delete(so->ngetcall, so->_cntml_padded);
    cnrn_target_delete(so->diag, n1);
    cnrn_target_delete(so->rowst, n1);
    cnrn_target_delete(so);
#else
    // Mark the parameter as used, like the other entry points of this file
    // do, so that builds taking this branch do not warn about it.
    static_cast<void>(so);
#endif
}

#ifdef CORENEURON_ENABLE_GPU

/** Copy the global ion map and each of its non-null member arrays to the device. */
void nrn_ion_global_map_copyto_device() {
    if (!nrn_ion_global_map_size) {
        return;
    }
    // the array of pointers first ...
    double** d_map = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size);
    // ... then every member array that exists, hooking it into the copied array
    for (int slot = 0; slot < nrn_ion_global_map_size; ++slot) {
        if (!nrn_ion_global_map[slot]) {
            continue;
        }
        double* d_member = cnrn_target_copyin(nrn_ion_global_map[slot],
                                              ion_global_map_member_size);
        cnrn_target_memcpy_to_device(&(d_map[slot]), &d_member);
    }
}

/** Delete the device copy of the global ion map: member arrays first, then the map. */
void nrn_ion_global_map_delete_from_device() {
    if (!nrn_ion_global_map_size) {
        return;
    }
    for (int slot = 0; slot < nrn_ion_global_map_size; ++slot) {
        if (!nrn_ion_global_map[slot]) {
            continue;
        }
        cnrn_target_delete(nrn_ion_global_map[slot], ion_global_map_member_size);
    }
    cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size);
}

/** Select the default device used by this process.
 *
 *  Fails if no device is available or if more devices were requested through
 *  `corenrn_param.num_gpus` than are available. The device is chosen from the
 *  node-local MPI rank (0 without MPI) modulo the number of devices in use.
 */
void init_gpu() {
    // devices available on this node; may be capped by the user request below
    int num_devices_per_node = cnrn_target_get_num_devices();

    // without any device there is nothing to run on
    if (num_devices_per_node == 0) {
        nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
    }

    // a non-zero num_gpus limits how many of the available devices are used
    if (corenrn_param.num_gpus != 0) {
        if (corenrn_param.num_gpus <= num_devices_per_node) {
            num_devices_per_node = corenrn_param.num_gpus;
        } else {
            nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
                            corenrn_param.num_gpus,
                            num_devices_per_node);
        }
    }

    // rank and number of ranks within the node; all threads of a rank share
    // the device selected for that rank
    int local_rank = 0;
    int local_size = 1;
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        local_rank = nrnmpi_local_rank();
        local_size = nrnmpi_local_size();
    }
#endif

    const int device_id = local_rank % num_devices_per_node;
    cnrn_target_set_default_device(device_id);

    const bool announce = (nrnmpi_myid == 0) && !corenrn_param.is_quiet();
    if (announce) {
        std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                  << " ranks per node\n";
    }
}

/** Copy every play record of a thread to the device.
 *
 *  Each entry of `nt->_vecplay` is treated as a VecPlayContinuous. The object
 *  is copied, entry `i` of `d_vecplay` is set to the copy, and the members
 *  `y_`, `t_`, `discon_indices_` (if not null), `e_` and `pd_` of the copy are
 *  then set up.
 *
 *  @param nt        thread owning the play records.
 *  @param d_vecplay array that receives the pointers to the copied objects;
 *                   it is written with cnrn_target_memcpy_to_device.
 */
void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
    for (int i = 0; i < nt->n_vecplay; i++) {
        VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];

        /** just VecPlayContinuous object */
        VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance);
        cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance);

        /** copy y_, t_ and discon_indices_ */
        copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_);
        copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_);
        // OL211213: beware, the test suite does not currently include anything
        // with a non-null discon_indices_.
        if (vecplay_instance->discon_indices_) {
            IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_);
            cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices);
            copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_),
                                     *(d_vecplay_instance->discon_indices_));
        }

        /** copy PlayRecordEvent : todo: verify this */
        PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_);

        // link the copied event and the copied play record to each other
        cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance));
        cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_);

        /** copy pd_ : note that it's pointer inside ml->data and hence data itself is
         * already on GPU */
        double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_);
        cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_);
    }
}

/** Delete the device copies of all play records of a thread.
 *
 *  For each record the event, the optional discontinuity indices, the time
 *  and value vectors and finally the record itself are deleted, in that order.
 */
void nrn_VecPlay_delete_from_device(NrnThread* nt) {
    for (int idx = 0; idx < nt->n_vecplay; ++idx) {
        auto* const player = static_cast<VecPlayContinuous*>(nt->_vecplay[idx]);
        cnrn_target_delete(player->e_);
        if (auto* const discon = player->discon_indices_) {
            delete_ivoc_vect_from_device(*discon);
        }
        delete_ivoc_vect_from_device(player->t_);
        delete_ivoc_vect_from_device(player->y_);
        cnrn_target_delete(player);
    }
}

#endif
}  // namespace coreneuron


================================================
FILE: coreneuron/gpu/nrn_acc_manager.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once

namespace coreneuron {
struct Memb_list;
struct NrnThread;
struct NetSendBuffer_t;
/// Set up the device copies of the `nthreads` NrnThread objects in `threads`.
void setup_nrnthreads_on_device(NrnThread* threads, int nthreads);
/// Delete the device copies created by setup_nrnthreads_on_device.
void delete_nrnthreads_on_device(NrnThread* threads, int nthreads);
/// Update the host copies of the thread data from the device.
void update_nrnthreads_on_host(NrnThread* threads, int nthreads);

/// Order the net receive buffers of a thread and update them on the device.
void update_net_receive_buffer(NrnThread* _nt);

// Called by NModl
/// Double the capacity of the net receive buffer of `ml`, if it has one.
void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
/// Copy the used part of the net send buffer `nsb` from the device to the host.
void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);

/// Copy the weights of all GPU threads from the device to the host.
void update_weights_from_gpu(NrnThread* threads, int nthreads);
/// Check that a device is available and select the default device of this process.
void init_gpu();
}  // namespace coreneuron


================================================
FILE: coreneuron/io/core2nrn_data_return.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <sstream>

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/io/core2nrn_data_return.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/utils/vrecitem.h"
#include "coreneuron/io/mem_layout_util.hpp"

/** @brief Information from NEURON to help with copying data to NEURON.
 *  Info for copying voltage, i_membrane_, and mechanism data.
 *  See implementation in
 *  nrn/src/nrniv/nrnbbcore_write.cpp:nrnthreads_type_return.
 *  Return is size of either the returned data pointer or the number
 *  of pointers in mdata. tid is the thread index.
 */
size_t (*nrn2core_type_return_)(int type, int tid, double*& data, double**& mdata);

/** @brief Call NEURON mechanism bbcore_read.
 *  Inverse of bbcore_write for transfer from NEURON to CoreNEURON.
 *  Mostly for transferring back the nrnran123_State sequence so psolve can
 *  continue on NEURON side (or continue psolve on CoreNEURON).
 */
extern "C" {
int (*core2nrn_corepointer_mech_)(int tid,
                                  int type,
                                  int icnt,
                                  int dcnt,
                                  int* iArray,
                                  double* dArray);
}

namespace coreneuron {

/** @brief Copy a permuted array into an unpermuted one.
 *  Element i of dest receives permuted_src[permute[i]]. When permute is
 *  null the first n elements are copied unchanged.
 */
static void inverse_permute_copy(size_t n, double* permuted_src, double* dest, int* permute) {
    if (permute == nullptr) {
        std::copy(permuted_src, permuted_src + n, dest);
        return;
    }
    for (size_t k = 0; k != n; ++k) {
        const int from = permute[k];
        dest[k] = permuted_src[from];
    }
}

/** @brief Copy permuted SoA mechanism data into unpermuted AoS data.
 *  dest holds n pointers, each to the start of an array of sz doubles.
 *  src is one contiguous array made of sz segments of length stride (stride
 *  may be slightly larger than n for alignment). Within every segment the
 *  value belonging to instance k is found at position permute[k].
 */
static void soa2aos_inverse_permute_copy(size_t n,
                                         int sz,
                                         int stride,
                                         double* src,
                                         double** dest,
                                         int* permute) {
    for (size_t k = 0; k != n; ++k) {
        // first variable of instance k; the others follow at multiples of stride
        double* const column = src + permute[k];
        double* const row = dest[k];
        for (int var = 0; var < sz; ++var) {
            row[var] = column[var * stride];
        }
    }
}

/** @brief Copy unpermuted SoA mechanism data into unpermuted AoS storage.
 *  dest holds n pointers, each to the start of an array of sz doubles.
 *  src is contiguous and consists of sz segments, each stride doubles long
 *  (stride may exceed n slightly because of alignment padding).
 *  The instances inside every segment are in the same order as the n
 *  pointers of dest.
 */
static void soa2aos_unpermuted_copy(size_t n, int sz, int stride, double* src, double** dest) {
    for (size_t inst = 0; inst != n; ++inst) {
        // src is SoA and not permuted: instance i starts at offset i.
        double* first = src + inst;
        double* out = dest[inst];
        for (int var = 0; var < sz; ++var) {
            out[var] = first[var * stride];
        }
    }
}

/** @brief Copy contiguous AoS mechanism data into per-instance AoS arrays.
 *  dest holds n pointers, each to the start of an array of sz doubles.
 *  src is a contiguous array made of n segments of sz doubles each.
 */
static void aos2aos_copy(size_t n, int sz, double* src, double** dest) {
    double* segment = src;
    for (size_t inst = 0; inst != n; ++inst, segment += sz) {
        std::copy(segment, segment + sz, dest[inst]);
    }
}

/** @brief Copy back COREPOINTER info to NEURON.
 *  Does nothing for mechanism types without a registered bbcore_write
 *  function. Otherwise the function is run over all instances twice: a first
 *  pass with null buffers to obtain the int and double counts, and a second
 *  pass that fills buffers of that size. The result is handed to NEURON via
 *  core2nrn_corepointer_mech_.
 */
static void core2nrn_corepointer(int tid, NrnThreadMembList* tml) {
    // Based on get_bbcore_write fragment in nrn_checkpoint.cpp
    const int type = tml->index;
    if (!corenrn.get_bbcore_write()[type]) {
        return;
    }
    NrnThread& nt = nrn_threads[tid];
    Memb_list* ml = tml->ml;
    const int layout = corenrn.get_mech_data_layout()[type];
    const int dsz = corenrn.get_prop_param_size()[type];
    const int pdsz = corenrn.get_prop_dparam_size()[type];
    const int aln_cntml = nrn_soa_padded_size(ml->nodecount, layout);

    int icnt = 0;
    int dcnt = 0;

    // One sweep over every instance (in unpermuted order) calling the
    // mechanism's bbcore_write with the given output buffers.
    auto sweep = [&](double* dbuf, int* ibuf) {
        for (int j = 0; j < ml->nodecount; ++j) {
            const int jp = ml->_permute ? ml->_permute[j] : j;
            double* d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout);
            Datum* pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);
            (*corenrn.get_bbcore_write()[type])(
                dbuf, ibuf, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0);
        }
    };

    // Pass 1: null buffers, only the counts are accumulated.
    sweep(nullptr, nullptr);

    // Allocate only what is needed; a zero count keeps the pointer null.
    std::unique_ptr<int[]> iArray(icnt ? new int[icnt] : nullptr);
    std::unique_ptr<double[]> dArray(dcnt ? new double[dcnt] : nullptr);

    // Pass 2: restart the counters and fill the buffers.
    icnt = 0;
    dcnt = 0;
    sweep(dArray.get(), iArray.get());

    (*core2nrn_corepointer_mech_)(tid, type, icnt, dcnt, iArray.get(), dArray.get());
}

/** @brief Copy event queue and related state back to NEURON.
 */
static void core2nrn_tqueue(NrnThread&);

/** @brief Callback to clear NEURON thread queues.
 *  In particular, the bin queues need to be initialized to the current time
 *  before events are transferred.
 */
extern "C" {
void (*core2nrn_clear_queues_)(double t);
}

/** @brief All activated WATCH statements need activation on NEURON side.
 */
// Core2NrnWatchInfo has one entry per mechanism instance, in unpermuted
// Memb_list index order. Each entry is a vector of activated watch_index
// values (the bool is whether it is above threshold).
using Core2NrnWatchInfoItem = std::vector<std::pair<int, bool>>;
using Core2NrnWatchInfo = std::vector<Core2NrnWatchInfoItem>;

extern "C" {
// Called once by core2nrn_watch() before any activation info is sent.
void (*core2nrn_watch_clear_)();
// Called by core2nrn_watch() once per thread and WATCH mechanism type.
void (*core2nrn_watch_activate_)(int tid, int type, int watch_begin, Core2NrnWatchInfo&);
}

static void core2nrn_watch();

/** @brief VecPlay indices back to NEURON */
extern "C" {
// Called once per VecPlayContinuous instance of a thread.
void (*core2nrn_vecplay_)(int tid, int i_nrn, int last, int discon, int ubound);
// Called once after the indices of all threads have been sent.
void (*core2nrn_vecplay_events_)();
}

static void core2nrn_vecplay();

/** @brief Copy data back to NEURON.
 *  Copies t, voltage, i_membrane_ if it is used, and mechanism param data.
 *  Copies event queue and related state, e.g. WATCH, VecPlayContinuous.
 *
 *  Returns immediately when the nrn2core_type_return_ callback is not set.
 */
void core2nrn_data_return() {
    if (!nrn2core_type_return_) {
        return;
    }

    (*core2nrn_clear_queues_)(nrn_threads[0]._t);  // all threads at same time

    for (int tid = 0; tid < nrn_nthread; ++tid) {
        size_t n = 0;
        double* data = nullptr;
        double** mdata = nullptr;
        NrnThread& nt = nrn_threads[tid];

        n = (*nrn2core_type_return_)(0, tid, data, mdata);  // 0 means time
        if (n) {                                            // not the empty thread
            data[0] = nt._t;
        }

        if (nt.end) {  // transfer voltage and possibly i_membrane_
            n = (*nrn2core_type_return_)(voltage, tid, data, mdata);
            assert(n == size_t(nt.end) && data);
            // nt._permute may be null, in which case this is a plain copy.
            inverse_permute_copy(n, nt._actual_v, data, nt._permute);

            if (nt.nrn_fast_imem) {
                n = (*nrn2core_type_return_)(i_membrane_, tid, data, mdata);
                assert(n == size_t(nt.end) && data);
                inverse_permute_copy(n, nt.nrn_fast_imem->nrn_sav_rhs, data, nt._permute);
            }
        }

        // Mechanism data: one pass per mechanism type present in this thread.
        for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
            int mtype = tml->index;
            Memb_list* ml = tml->ml;
            n = (*nrn2core_type_return_)(mtype, tid, data, mdata);
            // NOTE(review): the assert requires mdata to be non-null even when
            // n == 0, i.e. before the empty case is skipped below - confirm the
            // NEURON side always sets mdata.
            assert(n == size_t(ml->nodecount) && mdata);
            if (n == 0) {
                continue;
            }
            // NEURON is AoS, CoreNEURON may be SoA and may be permuted.
            // On the NEURON side, the data is actually contiguous because of
            // cache_efficient, but that may not be the case for ARTIFICIAL_CELL.
            // For initial implementation simplicity, use the mdata info which gives
            // a double* for each param_size mech instance.
            int* permute = ml->_permute;
            double* cndat = ml->data;
            int layout = corenrn.get_mech_data_layout()[mtype];
            int sz = corenrn.get_prop_param_size()[mtype];
            if (layout == Layout::SoA) {
                // The padded instance count is the distance between variables.
                int stride = ml->_nodecount_padded;
                if (permute) {
                    soa2aos_inverse_permute_copy(n, sz, stride, cndat, mdata, permute);
                } else {
                    soa2aos_unpermuted_copy(n, sz, stride, cndat, mdata);
                }
            } else { /* AoS */
                aos2aos_copy(n, sz, cndat, mdata);
            }

            // No-op for types without a registered bbcore_write function.
            core2nrn_corepointer(tid, tml);
        }

        // Copy the event queue and related state.
        core2nrn_tqueue(nt);
    }
    // These two iterate over all threads themselves.
    core2nrn_vecplay();
    core2nrn_watch();
}

/** @brief Callbacks into NEURON for WatchCondition.
 */
static void core2nrn_watch() {
    (*core2nrn_watch_clear_)();

    // much of the following nested iterations follows the
    // watch_activate_clear() function in sim/finitialize.cpp, though here
    // we iterate over nt._watch_types instead of nt.tml and then picking out
    // the WATCH relevant types with corenrn.get_watch_check().
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        NrnThread& nt = nrn_threads[tid];
        if (nt._watch_types) {
            for (int i = 0; nt._watch_types[i] != 0; ++i) {
                int type = nt._watch_types[i];
                Memb_list& ml = *(nt._ml_list[type]);
                int nodecount = ml.nodecount;
                Core2NrnWatchInfo watch_info(ml.nodecount);
                int* permute = ml._permute;
                int* pdata = (int*) ml.pdata;
                int dparam_size = corenrn.get_prop_dparam_size()[type];
                int layout = corenrn.get_mech_data_layout()[type];
                int first, last;
                watch_datum_indices(type, first, last);
                int watch_begin = first;
                for (int iml = 0; iml < nodecount; ++iml) {
                    int iml_permute = permute ? permute[iml] : iml;
                    Core2NrnWatchInfoItem& wiv = watch_info[iml];
                    for (int ix = first; ix <= last; ++ix) {
                        int datum =
                            pdata[nrn_i_layout(iml_permute, nodecount, ix, dparam_size, layout)];
                        if (datum & 2) {  // activated
                            bool above_thresh = bool(datum & 1);
                            wiv.push_back(std::pair<int, bool>(ix, above_thresh));
                        }
                    }
                }
                (*core2nrn_watch_activate_)(tid, type, watch_begin, watch_info);
            }
        }
    }
}

/** @brief Transfer VecPlay indices to NEURON.
 *  For every thread, asks NEURON for the index of each VecPlay instance and
 *  sends back the last, discontinuity and upper-bound indices of the
 *  corresponding VecPlayContinuous. Finishes with a single call to
 *  core2nrn_vecplay_events_.
 */
void core2nrn_vecplay() {
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        NrnThread& nt = nrn_threads[tid];
        std::vector<int> i_nrn;
        int ok = (*nrn2core_get_dat2_vecplay_)(tid, i_nrn);
        // The query must succeed whenever this thread has VecPlay instances.
        assert(!nt.n_vecplay || ok);
        for (int k = 0; k < nt.n_vecplay; ++k) {
            VecPlayContinuous* vp = (VecPlayContinuous*) nt._vecplay[k];
            const int last = (int) vp->last_index_;
            const int discon = (int) vp->discon_index_;
            const int ubound = (int) vp->ubound_index_;
            (*core2nrn_vecplay_)(tid, i_nrn[k], last, discon, ubound);
        }
    }
    (*core2nrn_vecplay_events_)();
}

/** @brief Callbacks into NEURON for queue event types.
 */
extern "C" {
// A NetCon event at time td; nc_index is the index into the thread's netcons.
void (*core2nrn_NetCon_event_)(int tid, double td, size_t nc_index);

// must calculate netcon index from the weight index on this side
void (*core2nrn_SelfEvent_event_)(int tid,
                                  double td,
                                  int tar_type,
                                  int tar_index,
                                  double flag,
                                  size_t nc_index,
                                  int is_movable);
// the no weight case
void (*core2nrn_SelfEvent_event_noweight_)(int tid,
                                           double td,
                                           int tar_type,
                                           int tar_index,
                                           double flag,
                                           int is_movable);

// PreSyn.flag_ will be 1 if it has fired and the value it is watching
// is still greater than threshold. Note that it is 0 no matter what after
// finitialize, so a set is used to send back the flag explicitly for any
// that are 1 (although that is not really relevant in the core2nrn
// direction). To match up PreSyn on NEURON and CoreNEURON side, we use
// the (unpermuted) voltage index.
void (*core2nrn_PreSyn_flag_)(int tid, std::set<int> presyns_flag_true);
// Receive the PreSyn.flag_ == true voltage indices from the neuron side.
void (*nrn2core_transfer_PreSyn_flag_)(int tid, std::set<int>& presyns_flag_true);
}

/** @brief Send the set of PreSyn with flag_ on back to NEURON.
 *  Each such PreSyn with a non-negative thvar_index_ is identified by its
 *  unpermuted voltage index.
 */
static void core2nrn_PreSyn_flag(NrnThread& nt) {
    // Inverse of the node permutation; stays null when nodes are not permuted.
    std::unique_ptr<int[]> inverse(nt._permute ? inverse_permute(nt._permute, nt.end) : nullptr);

    std::set<int> presyns_flag_true;
    for (int k = 0; k < nt.n_presyn; ++k) {
        if (!nt.presyns_helper[k].flag_) {
            continue;
        }
        const auto thvar = nt.presyns[k].thvar_index_;
        if (!(thvar >= 0)) {
            continue;
        }
        const int index_v = inverse ? inverse[thvar] : thvar;
        presyns_flag_true.insert(index_v);
    }
    // have to send even if empty so NEURON side can turn off all flag_
    (*core2nrn_PreSyn_flag_)(nt.id, presyns_flag_true);
}

/** @brief Receive from NEURON which PreSyn have flag_ on for thread tid.
 *  All flags of the thread are first reset; then every PreSyn whose
 *  unpermuted voltage index is in the set received from NEURON gets
 *  flag_ = 1.
 */
void nrn2core_PreSyn_flag_receive(int tid) {
    NrnThread& nt = nrn_threads[tid];
    // turn off all the PreSyn.flag_ as they might have been turned off
    // on the NEURON side if NEURON integrated a bit.
    for (int k = 0; k < nt.n_presyn; ++k) {
        nt.presyns_helper[k].flag_ = 0;  // in case 1 from previous psolve
    }

    std::set<int> pending;
    (*nrn2core_transfer_PreSyn_flag_)(tid, pending);
    if (pending.empty()) {
        return;
    }

    // Inverse of the node permutation; stays null when nodes are not permuted.
    std::unique_ptr<int[]> inverse(nt._permute ? inverse_permute(nt._permute, nt.end) : nullptr);

    // Stop scanning as soon as every received index has been matched.
    for (int k = 0; k < nt.n_presyn && !pending.empty(); ++k) {
        const auto thvar = nt.presyns[k].thvar_index_;
        if (!(thvar >= 0)) {
            continue;
        }
        const int index_v = inverse ? inverse[thvar] : thvar;
        if (pending.erase(index_v) != 0) {
            nt.presyns_helper[k].flag_ = 1;
        }
    }
}

// Mechanism type -> heap-allocated inverse permutation of that type's
// instances. The arrays are owned by this map.
std::map<int, int*> type2invperm;

/** @brief Free every cached inverse permutation and empty the cache.
 */
static void clear_inv_perm_for_selfevent_targets() {
    for (auto entry = type2invperm.begin(); entry != type2invperm.end(); ++entry) {
        delete[] entry->second;
    }
    type2invperm.clear();
}


// weight_index -> queue items of the SelfEvents that use that weight index.
using SelfEventWeightMap = std::map<int, std::vector<TQItem*>>;

/** @brief Transfer a single queue item to NEURON.
 *  NetCon events and SelfEvents without a weight are sent immediately.
 *  SelfEvents with a weight_index >= 0 are only collected in sewm, so that
 *  the caller can determine their NetCon index later.
 *  An unknown event type is reported through hoc_execerror.
 *  @param q the queue item to transfer
 *  @param sewm collects the items of SelfEvents that have a weight
 *  @param nt the thread owning the queue
 *  @return false unless q is pushed to sewm
 */
static bool core2nrn_tqueue_item(TQItem* q, SelfEventWeightMap& sewm, NrnThread& nt) {
    DiscreteEvent* d = (DiscreteEvent*) q->data_;
    double td = q->t_;
    bool in_sewm = false;

    switch (d->type()) {
        case NetConType: {
            NetCon* nc = (NetCon*) d;
            assert(nc >= nt.netcons && (nc < (nt.netcons + nt.n_netcon)));
            // The NetCon is identified by its position in the thread's array.
            size_t nc_index = nc - nt.netcons;
            (*core2nrn_NetCon_event_)(nt.id, td, nc_index);
            break;
        }
        case SelfEventType: {
            SelfEvent* se = (SelfEvent*) d;
            Point_process* pnt = se->target_;
            assert(pnt->_tid == nt.id);
            int tar_type = (int) pnt->_type;
            Memb_list* ml = nt._ml_list[tar_type];
            if (ml->_permute) {  // if permutation, then make inverse available
                // Doing this here because we don't know, in general, which
                // mechanisms use SelfEvent
                if (type2invperm.count(tar_type) == 0) {
                    type2invperm[tar_type] = inverse_permute(ml->_permute, ml->nodecount);
                }
            }
            double flag = se->flag_;
            TQItem** movable = (TQItem**) (se->movable_);
            // Movable only if the movable slot currently refers to this item.
            int is_movable = (movable && *movable == q) ? 1 : 0;
            int weight_index = se->weight_index_;
            // the weight_index is useless on the NEURON side so we need
            // to convert that to NetCon index and let the NEURON side
            // figure out the weight_index. To figure out the netcon_index
            // construct a {weight_index : [TQItem]} here for any
            // weight_index >= 0, otherwise send it to NEURON now.
            if (weight_index >= 0) {
                // Potentially several SelfEvent TQItem* associated with
                // same weight index. More importantly, collect them all
                // so that we only need to iterate over the nt.netcons once
                sewm[weight_index].push_back(q);
                in_sewm = true;

            } else {
                int tar_index = pnt->_i_instance;  // correct for no permutation
                if (ml->_permute) {
                    tar_index = type2invperm[tar_type][tar_index];
                }
                (*core2nrn_SelfEvent_event_noweight_)(
                    nt.id, td, tar_type, tar_index, flag, is_movable);
                // The event has been handed over; q itself is left to the caller.
                delete se;
            }
            break;
        }
        case PreSynType: {
            // nothing to transfer
            // `d` can be cast to PreSyn*
            break;
        }
        case NetParEventType: {
            // nothing to transfer
            break;
        }
        case PlayRecordEventType: {
            // nothing to transfer
            break;
        }
        default: {
            // In particular, InputPreSyn does not appear in tqueue as it
            // immediately fans out to NetCon.
            std::stringstream qetype;
            qetype << d->type();
            hoc_execerror("core2nrn_tqueue_item -> unimplemented queue event type:",
                          qetype.str().c_str());
            break;
        }
    }
    return in_sewm;
}

/** @brief Copy the event queue and related state of one thread to NEURON.
 *  Sends the PreSyn flags, drains the thread's queue (sending each item via
 *  core2nrn_tqueue_item), walks the bin queue, and finally resolves the
 *  NetCon index of every SelfEvent that has a weight before sending it.
 */
void core2nrn_tqueue(NrnThread& nt) {
    // VecPlayContinuous

    // PatternStim

    // nrn_checkpoint.cpp has:
    // Avoid extra spikes due to some presyn voltages above threshold

    // PreSyn.flag_ that are on
    core2nrn_PreSyn_flag(nt);

    // The items on the queue
    NetCvodeThreadData& ntd = net_cvode_instance->p[nt.id];
    // make sure all buffered interthread events are on the queue
    ntd.enqueue(net_cvode_instance, &nt);

    TQueue<QTYPE>* tqe = ntd.tqe_;
    TQItem* q;
    SelfEventWeightMap sewm;
    // TQItems from atomic_dq
    while ((q = tqe->atomic_dq(1e20)) != nullptr) {
        // Items pushed to sewm are still needed below; all others are done.
        if (core2nrn_tqueue_item(q, sewm, nt) == false) {
            delete q;
        }
    }
    // TQitems from binq_
    for (q = tqe->binq_->first(); q; q = tqe->binq_->next(q)) {
        bool const result = core2nrn_tqueue_item(q, sewm, nt);
        // NOTE(review): bin queue items are not deleted here and are expected
        // never to land in sewm; with asserts disabled such an item would be
        // deleted in the loop below while still linked in binq_ - confirm.
        assert(result == false);
    }

    // For self events with weight, find the NetCon index and send that
    // to NEURON.
    // If the SelfEventWeightMap approach (and the corresponding pattern
    // on the nrn2core side in NEURON) ends up being too expensive in space
    // or time, it would be possible to modify SelfEvent to use the NetCon
    // index instead of the weight index, and then directly determine the
    // NetCon within the core2nrn_tqueue_item function above and call
    // (*core2nrn_SelfEvent_event_) from there.
    if (!sewm.empty()) {
        for (int nc_index = 0; nc_index < nt.n_netcon; ++nc_index) {
            NetCon& nc = nt.netcons[nc_index];
            int weight_index = nc.u.weight_index_;
            auto search = sewm.find(weight_index);
            if (search != sewm.end()) {
                const auto& tqitems = search->second;
                for (auto q: tqitems) {
                    DiscreteEvent* d = (DiscreteEvent*) (q->data_);
                    double td = q->t_;
                    assert(d->type() == SelfEventType);
                    SelfEvent* se = (SelfEvent*) d;
                    int tar_type = se->target_->_type;
                    // Note that instead of getting tar_index from the permuted
                    // pnt->_i_instance here and for the noweight case above
                    // which then needs the possibly large inverse permutation
                    // vectors, it would save some space to use the unpermuted
                    // nt.pntprocs array along with a much shorter vector
                    // of type offsets.
                    int tar_index = se->target_->_i_instance;
                    if (nt._ml_list[tar_type]->_permute) {
                        tar_index = type2invperm[tar_type][tar_index];
                    }
                    double flag = se->flag_;
                    TQItem** movable = (TQItem**) (se->movable_);
                    int is_movable = (movable && *movable == q) ? 1 : 0;
                    (*core2nrn_SelfEvent_event_)(
                        nt.id, td, tar_type, tar_index, flag, nc_index, is_movable);
                    // Both the queue item and its event are finished with.
                    delete q;
                    delete se;
                }
            }
        }
    }

    // The inverse permutations were cached only for this transfer.
    clear_inv_perm_for_selfevent_targets();
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/core2nrn_data_return.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

namespace coreneuron {

/** @brief Copies back to NEURON everything needed to analyze and continue simulation.
 *  I.e. voltage, i_membrane_, mechanism data, event queue, WATCH state,
 *  Play state, etc.
 */
extern void core2nrn_data_return();

/** @brief Return first and last datum indices of WATCH statements.
 *  @param type mechanism type
 *  @param first receives the first WATCH datum index
 *  @param last receives the last WATCH datum index (callers treat the range
 *         as inclusive)
 */
extern void watch_datum_indices(int type, int& first, int& last);

}  // namespace coreneuron


================================================
FILE: coreneuron/io/file_utils.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <string>
#include <sys/stat.h>
#include <errno.h>

#if defined(MINGW)
#define mkdir(dir_name, permission) _mkdir(dir_name)
#endif

/* adapted from : gist@jonathonreinhart/mkdir_p.c */
/** @brief Create a directory and all of its missing parents (like mkdir -p).
 *  Components that already exist are not treated as an error.
 *  @param path directory path; a null or empty path is rejected
 *  @return 0 on success, -1 if the path is null/empty or a component could
 *          not be created (errno is left as set by the failing mkdir)
 */
int mkdir_p(const char* path) {
    if (path == nullptr || *path == '\0') {
        printf("Warning: Empty path for creating directory");
        return -1;
    }

    // Owned, mutable copy of the path. Using std::string means the buffer is
    // released on every return path (the raw new[] buffer used to leak when
    // mkdir failed).
    std::string dirpath(path);
    errno = 0;

    /* iterate from outer upto inner dir; index 0 is skipped so that a
       leading '/' does not produce an empty component */
    for (std::size_t pos = 1; pos < dirpath.size(); ++pos) {
        if (dirpath[pos] != '/') {
            continue;
        }
        /* temporarily truncate to sub-dir */
        dirpath[pos] = '\0';
        const bool failed = mkdir(dirpath.c_str(), S_IRWXU) != 0 && errno != EEXIST;
        dirpath[pos] = '/';
        if (failed) {
            return -1;
        }
    }

    // Finally the full path itself.
    if (mkdir(dirpath.c_str(), S_IRWXU) != 0 && errno != EEXIST) {
        return -1;
    }

    return 0;
}


================================================
FILE: coreneuron/io/file_utils.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

/**
 * @file file_utils.hpp
 * @brief Utility functions for file/directory management
 *
 */

#pragma once

/** @brief Creates directory if it doesn't exist (similar to mkdir -p)
 *  @param path Directory path
 *  @return Status: 0 on success, -1 on failure
 */
int mkdir_p(const char* path);


================================================
FILE: coreneuron/io/global_vars.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstdio>
#include <cstring>
#include <map>
#include <string>
#include <algorithm>

#include "coreneuron/utils/randoms/nrnran123.h"
#include "coreneuron/nrnconf.h"
#include "coreneuron/mechanism/membfunc.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/utils/nrnoc_aux.hpp"

// Callback used in embedded mode to iterate over NEURON's global doubles.
// It is called with the handle returned by the previous call (nullptr to
// start) and returns the next handle, or nullptr after the last item.
// When val is set, it points to an array the caller must delete[];
// size == 0 denotes a scalar, otherwise size is the array length.
void* (*nrn2core_get_global_dbl_item_)(void*, const char*& name, int& size, double*& val);
// Callback used in embedded mode to query a NEURON global int by name.
int (*nrn2core_get_global_int_item_)(const char* name);

namespace coreneuron {
// (array size, pointer to storage); a size of 0 is used for scalars.
using PSD = std::pair<std::size_t, double*>;
// Global variable name -> size and storage location.
using N2V = std::map<std::string, PSD>;

// Registry of known globals. Allocated on first use and deleted again at
// the end of set_globals().
static N2V* n2v;

/** @brief Register global scalar and vector variables by name.
 *  The entries are stored in the n2v registry so that set_globals() can
 *  later assign their values. The registry is created on first use.
 *  @param ds scalars; the list ends at the first entry with a null name
 *  @param dv vectors; the list ends at the first entry with a null name
 *  The third (VoidFunc*) argument is unused.
 */
void hoc_register_var(DoubScal* ds, DoubVec* dv, VoidFunc*) {
    if (!n2v) {
        n2v = new N2V();
    }
    // Scalars are registered with size 0.
    for (size_t i = 0; ds[i].name; ++i) {
        (*n2v)[ds[i].name] = PSD(0, ds[i].pdoub);
    }
    // Vectors are registered with their length and their own storage.
    // (The pointer used to be taken from ds[i], which registered the wrong
    // variable and could read past the end of the scalar list.)
    for (size_t i = 0; dv[i].name; ++i) {
        (*n2v)[dv[i].name] = PSD(dv[i].index1, dv[i].pdoub);
    }
}

/** @brief Assign the values of the registered global variables.
 *  In embedded mode the values are obtained directly from NEURON through the
 *  nrn2core_get_global_* callbacks; otherwise they are read from the file
 *  `<path>/globals.dat`. Only names present in the n2v registry are assigned.
 *  The registry is deleted before returning.
 *  @param path directory containing globals.dat (used in file mode only)
 *  @param cli_global_seed whether a seed was given on the command line
 *  @param cli_global_seed_value that seed; in file mode it overrides the
 *         Random123 global index read from the file
 */
void set_globals(const char* path, bool cli_global_seed, int cli_global_seed_value) {
    if (!n2v) {
        n2v = new N2V();
    }
    (*n2v)["celsius"] = PSD(0, &celsius);
    (*n2v)["dt"] = PSD(0, &dt);
    (*n2v)["t"] = PSD(0, &t);
    (*n2v)["PI"] = PSD(0, &pi);

    if (corenrn_embedded) {  // CoreNEURON embedded, get info direct from NEURON

        const char* name;
        int size;
        double* val = nullptr;
        void* p = nullptr;
        while (1) {
            p = (*nrn2core_get_global_dbl_item_)(p, name, size, val);
            // If the last item in the NEURON symbol table is a USERDOUBLE
            // then p is NULL but val is not NULL and following fragment
            // will be processed before exit from loop.
            if (val) {
                N2V::iterator it = n2v->find(name);
                if (it != n2v->end()) {
                    if (size == 0) {
                        nrn_assert(it->second.first == 0);
                        *(it->second.second) = val[0];
                    } else {
                        nrn_assert(it->second.first == (size_t) size);
                        double* pval = it->second.second;
                        for (int i = 0; i < size; ++i) {
                            pval[i] = val[i];
                        }
                    }
                }
                // val is handed over by the callback and released here.
                delete[] val;
                val = nullptr;
            }
            if (!p) {
                break;
            }
        }
        secondorder = (*nrn2core_get_global_int_item_)("secondorder");
        nrnran123_set_globalindex((*nrn2core_get_global_int_item_)("Random123_global_index"));

    } else {  // get the info from the globals.dat file
        std::string fname = std::string(path) + std::string("/globals.dat");
        FILE* f = fopen(fname.c_str(), "r");
        if (!f) {
            printf("ignore: could not open %s\n", fname.c_str());
            delete n2v;
            n2v = nullptr;
            return;
        }

        char line[256];

        // The field width keeps an over-long first token from overflowing line.
        nrn_assert(fscanf(f, "%255s\n", line) == 1);
        check_bbcore_write_version(line);

        // First section: "name value" scalars and "name[n]" arrays followed
        // by n value lines. A name of "0" terminates the section.
        for (;;) {
            char name[256];
            double val;
            int n;
            nrn_assert(fgets(line, 256, f) != nullptr);
            N2V::iterator it;
            if (sscanf(line, "%s %lf", name, &val) == 2) {
                if (strcmp(name, "0") == 0) {
                    break;
                }
                it = n2v->find(name);
                if (it != n2v->end()) {
                    nrn_assert(it->second.first == 0);
                    *(it->second.second) = val;
                }
            } else if (sscanf(line, "%[^[][%d]\n", name, &n) == 2) {
                if (strcmp(name, "0") == 0) {
                    break;
                }
                it = n2v->find(name);
                if (it != n2v->end()) {
                    nrn_assert(it->second.first == (size_t) n);
                    double* pval = it->second.second;
                    for (int i = 0; i < n; ++i) {
                        nrn_assert(fgets(line, 256, f) != nullptr);
                        nrn_assert(sscanf(line, "%lf\n", &val) == 1);
                        pval[i] = val;
                    }
                }
            } else {
                nrn_assert(0);
            }
        }

        // Remaining lines: "name int" settings; unknown names are ignored.
        while (fgets(line, 256, f)) {
            char name[256];
            int n;
            if (sscanf(line, "%s %d", name, &n) == 2) {
                if (strcmp(name, "secondorder") == 0) {
                    secondorder = n;
                } else if (strcmp(name, "Random123_globalindex") == 0) {
                    nrnran123_set_globalindex((uint32_t) n);
                } else if (strcmp(name, "_nrnunit_use_legacy_") == 0) {
                    if (n != CORENEURON_USE_LEGACY_UNITS) {
                        hoc_execerror(
                            "CORENRN_ENABLE_LEGACY_UNITS not"
                            " consistent with NEURON value of"
                            " nrnunit_use_legacy()",
                            nullptr);
                    }
                }
            }
        }

        fclose(f);

        // overwrite global.dat config if seed is specified on Command line
        if (cli_global_seed) {
            nrnran123_set_globalindex((uint32_t) cli_global_seed_value);
        }
    }

#if CORENRN_DEBUG
    for (const auto& item: *n2v) {
        // %zu matches the std::size_t size; %p requires a void*.
        printf("%s %zu %p\n",
               item.first.c_str(),
               item.second.first,
               static_cast<void*>(item.second.second));
    }
#endif

    delete n2v;
    n2v = nullptr;
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/lfp.cpp
================================================
#include "coreneuron/io/lfp.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

#include <cmath>
#include <limits>
#include <sstream>


namespace coreneuron {
namespace lfputils {

/** @brief LFP factor of a line-source segment for one electrode.
 *  A segment of (nearly) zero length is delegated to
 *  point_source_lfp_factor. Otherwise the factor is f times a sum of
 *  logarithmic integrals along the segment, plus a term proportional to
 *  1 / radius for the part of the segment for which delta > 0 selects the
 *  interval [mu - sqrt(delta), mu + sqrt(delta)].
 *  @param e_pos electrode position
 *  @param seg_0 segment start
 *  @param seg_1 segment end
 *  @param radius segment radius, asserted to be non-negative
 *  @param f scale factor applied to the result
 *  @throws std::invalid_argument from the logarithmic integral when the
 *          squared relative distance q2 is below epsilon and its two
 *          arguments do not have the same sign
 */
double line_source_lfp_factor(const Point3D& e_pos,
                              const Point3D& seg_0,
                              const Point3D& seg_1,
                              const double radius,
                              const double f) {
    nrn_assert(radius >= 0.0);
    const double eps = std::numeric_limits<double>::epsilon();

    // Segment vector and electrode position, both relative to seg_0.
    Point3D dx = paxpy(seg_1, -1.0, seg_0);
    Point3D de = paxpy(e_pos, -1.0, seg_0);
    const double dx2 = dot(dx, dx);
    const double dxn = std::sqrt(dx2);
    if (dxn < eps) {
        return point_source_lfp_factor(e_pos, seg_0, radius, f);
    }

    const double de2 = dot(de, de);
    const double mu = dot(dx, de) / dx2;
    Point3D de_star(paxpy(de, -mu, dx));
    const double de_star2 = dot(de_star, de_star);
    const double q2 = de_star2 / dx2;

    const double delta = mu * mu - (de2 - radius * radius) / dx2;
    const double one_m_mu = 1.0 - mu;

    auto log_integral = [&q2, &dxn, eps](double a, double b) {
        if (q2 < eps) {
            if (a * b <= 0) {
                std::ostringstream s;
                s << "Log integral: invalid arguments " << b << " " << a
                  << ". Likely electrode exactly on the segment and "
                  << "no flooring is present.";
                throw std::invalid_argument(s.str());
            }
            return std::abs(std::log(b / a)) / dxn;
        }
        return std::log((b + std::sqrt(b * b + q2)) / (a + std::sqrt(a * a + q2))) / dxn;
    };

    if (delta <= 0.0) {
        return f * log_integral(-mu, one_m_mu);
    }

    const double sqr_delta = std::sqrt(delta);
    const double d1 = mu - sqr_delta;
    const double d2 = mu + sqr_delta;
    double parts = 0.0;
    if (d1 > 0.0) {
        const double b = std::min(d1, 1.0) - mu;
        parts += log_integral(-mu, b);
    }
    if (d2 < 1.0) {
        const double b = std::max(d2, 0.0) - mu;
        parts += log_integral(b, one_m_mu);
    }
    // complement
    const double maxd1_0 = std::max(d1, 0.0);
    const double mind2_1 = std::min(d2, 1.0);
    if (maxd1_0 < mind2_1) {
        parts += 1.0 / radius * (mind2_1 - maxd1_0);
    }
    return f * parts;
}
}  // namespace lfputils

using namespace lfputils;

/**
 * \brief Precompute, for every (electrode, segment) pair, the factor that
 * lfp() multiplies with the segment's membrane current.
 *
 * \param seg_start start point of every segment
 * \param seg_end end point of every segment
 * \param radius per-segment radius forwarded to getFactor()
 * \param segment_ids for segment l, segment_ids[l] is the index used by lfp()
 * to look up that segment's membrane current. Only a reference is stored, so
 * the vector has to outlive this object.
 * \param electrodes positions of the electrodes
 * \param extra_cellular_conductivity conductivity of the extra-cellular medium
 * \throws std::invalid_argument if the per-segment inputs are inconsistent
 */
template <LFPCalculatorType Type, typename SegmentIdTy>
LFPCalculator<Type, SegmentIdTy>::LFPCalculator(const Point3Ds& seg_start,
                                                const Point3Ds& seg_end,
                                                const std::vector<double>& radius,
                                                const std::vector<SegmentIdTy>& segment_ids,
                                                const Point3Ds& electrodes,
                                                double extra_cellular_conductivity)
    : segment_ids_(segment_ids) {
    const size_t n_segments = seg_start.size();
    if (n_segments != seg_end.size()) {
        throw std::invalid_argument("Different number of segment starts and ends.");
    }
    if (n_segments != radius.size()) {
        throw std::invalid_argument("Different number of segments and radii.");
    }
    // lfp() reads segment_ids_[l] for every segment l; reject an id vector
    // that is too short instead of reading past its end later on.
    if (segment_ids.size() < n_segments) {
        throw std::invalid_argument("Fewer segment ids than segments.");
    }
    const double f(1.0 / (extra_cellular_conductivity * 4.0 * pi));

    // m[k][l] is the factor of segment l as seen from electrode k.
    m.resize(electrodes.size());
    for (size_t k = 0; k < electrodes.size(); ++k) {
        auto& ms = m[k];
        ms.resize(n_segments);
        for (size_t l = 0; l < n_segments; l++) {
            ms[l] = getFactor(electrodes[k], seg_start[l], seg_end[l], radius[l], f);
        }
    }
}

/**
 * \brief Evaluate the LFP at every electrode for the given membrane currents.
 *
 * For each electrode the precomputed factors are multiplied with the current
 * selected through segment_ids_ and summed up. When MPI is enabled the
 * per-rank sums are all-reduced into lfp_values_, otherwise the local sums
 * become lfp_values_ directly.
 *
 * \param membrane_current indexable container of membrane currents
 */
template <LFPCalculatorType Type, typename SegmentIdTy>
template <typename Vector>
inline void LFPCalculator<Type, SegmentIdTy>::lfp(const Vector& membrane_current) {
    const size_t n_electrodes = m.size();
    std::vector<double> local_sums(n_electrodes);
    for (size_t e = 0; e < n_electrodes; ++e) {
        const auto& factors = m[e];
        local_sums[e] = 0.0;
        for (size_t s = 0; s < factors.size(); ++s) {
            local_sums[e] += factors[s] * membrane_current[segment_ids_[s]];
        }
    }
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        lfp_values_.resize(local_sums.size());
        // reduction type 1, called mpi_sum in this code base
        int mpi_sum{1};
        nrnmpi_dbl_allreduce_vec(local_sums.data(),
                                 lfp_values_.data(),
                                 local_sums.size(),
                                 mpi_sum);
        return;
    }
#endif
    // no MPI reduction needed: hand the local result over without copying
    lfp_values_.swap(local_sums);
}


// Explicit instantiations for both calculator types with the default `int`
// segment id type: the constructor, and lfp() for membrane currents passed
// either as a raw `double*` (DoublePtr) or as a std::vector<double>.
template LFPCalculator<LineSource>::LFPCalculator(const lfputils::Point3Ds& seg_start,
                                                  const lfputils::Point3Ds& seg_end,
                                                  const std::vector<double>& radius,
                                                  const std::vector<int>& segment_ids,
                                                  const lfputils::Point3Ds& electrodes,
                                                  double extra_cellular_conductivity);
template LFPCalculator<PointSource>::LFPCalculator(const lfputils::Point3Ds& seg_start,
                                                   const lfputils::Point3Ds& seg_end,
                                                   const std::vector<double>& radius,
                                                   const std::vector<int>& segment_ids,
                                                   const lfputils::Point3Ds& electrodes,
                                                   double extra_cellular_conductivity);
template void LFPCalculator<LineSource>::lfp(const DoublePtr& membrane_current);
template void LFPCalculator<PointSource>::lfp(const DoublePtr& membrane_current);
template void LFPCalculator<LineSource>::lfp(const std::vector<double>& membrane_current);
template void LFPCalculator<PointSource>::lfp(const std::vector<double>& membrane_current);

}  // namespace coreneuron


================================================
FILE: coreneuron/io/lfp.hpp
================================================
#pragma once

#include <array>
#include <vector>

#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/nrnconf.h"
#include "coreneuron/utils/nrn_assert.h"

namespace coreneuron {

namespace lfputils {

/// A point or vector made of three double coordinates.
using Point3D = std::array<double, 3>;
/// A sequence of 3D points.
using Point3Ds = std::vector<Point3D>;
/// Raw pointer to doubles.
using DoublePtr = double*;

/// Dot product of the two vectors.
inline double dot(const Point3D& p1, const Point3D& p2) {
    double acc = p1[0] * p2[0];
    acc += p1[1] * p2[1];
    acc += p1[2] * p2[2];
    return acc;
}

/// Euclidean length of the vector.
inline double norm(const Point3D& p1) {
    const double squared_length = dot(p1, p1);
    return std::sqrt(squared_length);
}

/// Midpoint between the two points.
inline Point3D barycenter(const Point3D& p1, const Point3D& p2) {
    Point3D mid{};
    for (std::size_t axis = 0; axis < mid.size(); ++axis) {
        mid[axis] = 0.5 * (p1[axis] + p2[axis]);
    }
    return mid;
}

/// Component-wise p1 + alpha * p2.
inline Point3D paxpy(const Point3D& p1, const double alpha, const Point3D& p2) {
    Point3D out{};
    for (std::size_t axis = 0; axis < out.size(); ++axis) {
        out[axis] = p1[axis] + alpha * p2[axis];
    }
    return out;
}

/**
 *
 * \param e_pos electrode position
 * \param seg_pos segment position
 * \param radius segment radius
 * \param double conductivity factor 1/([4 pi] * [conductivity])
 * \return Resistance of the medium from the segment to the electrode.
 */
inline double point_source_lfp_factor(const Point3D& e_pos,
                                      const Point3D& seg_pos,
                                      const double radius,
                                      const double f) {
    nrn_assert(radius >= 0.0);
    Point3D es = paxpy(e_pos, -1.0, seg_pos);
    return f / std::max(norm(es), radius);
}

/**
 * Factor of a segment treated as a line source running from seg_0 to seg_1.
 *
 * \param e_pos electrode position
 * \param seg_0 first end point of the segment
 * \param seg_1 second end point of the segment
 * \param radius segment radius, must be non-negative
 * \param f conductivity factor 1/([4 pi] * [conductivity])
 * \return Resistance of the medium from the segment to the electrode.
 */
double line_source_lfp_factor(const Point3D& e_pos,
                              const Point3D& seg_0,
                              const Point3D& seg_1,
                              const double radius,
                              const double f);
}  // namespace lfputils

/// Selects how LFPCalculator models a segment: LineSource uses the line
/// between the segment's end points, PointSource uses a point at the
/// barycenter of the two end points.
enum LFPCalculatorType { LineSource, PointSource };

/**
 * \brief LFPCalculator allows calculation of LFP given membrane currents.
 *
 * \tparam Ty source model used for the per-segment factor (see getFactor)
 * \tparam SegmentIdTy integral type of the segment ids, `int` by default
 */
template <LFPCalculatorType Ty, typename SegmentIdTy = int>
struct LFPCalculator {
    /**
     * LFP Calculator constructor
     * \param seg_start all segments start owned by the proc
     * \param seg_end all segments end owned by the proc
     * \param radius fence around the segment. Ensures electrode cannot be
     * arbitrarily close to the segment
     * \param segment_ids per-segment index into the membrane current container
     * given to lfp(). The calculator keeps a reference to this vector (see
     * segment_ids_), so it must stay alive as long as the calculator is used.
     * \param electrodes positions of the electrodes
     * \param extra_cellular_conductivity conductivity of the extra-cellular
     * medium
     */
    LFPCalculator(const lfputils::Point3Ds& seg_start,
                  const lfputils::Point3Ds& seg_end,
                  const std::vector<double>& radius,
                  const std::vector<SegmentIdTy>& segment_ids,
                  const lfputils::Point3Ds& electrodes,
                  double extra_cellular_conductivity);

    /**
     * Compute the LFP values for the given membrane currents and store them;
     * read the result back with lfp_values().
     * \param membrane_current container indexable by segment id
     */
    template <typename Vector>
    void lfp(const Vector& membrane_current);

    /// Values computed by the most recent call to lfp(), one per electrode.
    const std::vector<double>& lfp_values() const noexcept {
        return lfp_values_;
    }

  private:
    /// Factor of one segment for one electrode; specialized per source type.
    inline double getFactor(const lfputils::Point3D& e_pos,
                            const lfputils::Point3D& seg_0,
                            const lfputils::Point3D& seg_1,
                            const double radius,
                            const double f) const;
    /// result of the last lfp() call
    std::vector<double> lfp_values_;
    /// m[electrode][segment]: precomputed factor for that pair
    std::vector<std::vector<double>> m;
    /// NOT owned: refers to the vector passed to the constructor
    const std::vector<SegmentIdTy>& segment_ids_;
};

/// Line-source specialization: forwards to lfputils::line_source_lfp_factor.
/// Declared `inline` explicitly: an explicit specialization defined in a
/// header is not inline unless it says so itself, and would otherwise be
/// defined once per including translation unit.
template <>
inline double LFPCalculator<LineSource>::getFactor(const lfputils::Point3D& e_pos,
                                                   const lfputils::Point3D& seg_0,
                                                   const lfputils::Point3D& seg_1,
                                                   const double radius,
                                                   const double f) const {
    return lfputils::line_source_lfp_factor(e_pos, seg_0, seg_1, radius, f);
}

/// Point-source specialization: the segment is represented by the barycenter
/// of its two end points and handed to lfputils::point_source_lfp_factor.
/// Declared `inline` explicitly: an explicit specialization defined in a
/// header is not inline unless it says so itself, and would otherwise be
/// defined once per including translation unit.
template <>
inline double LFPCalculator<PointSource>::getFactor(const lfputils::Point3D& e_pos,
                                                    const lfputils::Point3D& seg_0,
                                                    const lfputils::Point3D& seg_1,
                                                    const double radius,
                                                    const double f) const {
    return lfputils::point_source_lfp_factor(e_pos, lfputils::barycenter(seg_0, seg_1), radius, f);
}

// Explicit instantiation declarations: tell including translation units not
// to instantiate the constructor and lfp() themselves for these argument
// combinations (int segment ids; currents as double* or std::vector<double>).
extern template LFPCalculator<LineSource>::LFPCalculator(const lfputils::Point3Ds& seg_start,
                                                         const lfputils::Point3Ds& seg_end,
                                                         const std::vector<double>& radius,
                                                         const std::vector<int>& segment_ids,
                                                         const lfputils::Point3Ds& electrodes,
                                                         double extra_cellular_conductivity);
extern template LFPCalculator<PointSource>::LFPCalculator(const lfputils::Point3Ds& seg_start,
                                                          const lfputils::Point3Ds& seg_end,
                                                          const std::vector<double>& radius,
                                                          const std::vector<int>& segment_ids,
                                                          const lfputils::Point3Ds& electrodes,
                                                          double extra_cellular_conductivity);
extern template void LFPCalculator<LineSource>::lfp(const lfputils::DoublePtr& membrane_current);
extern template void LFPCalculator<PointSource>::lfp(const lfputils::DoublePtr& membrane_current);
extern template void LFPCalculator<LineSource>::lfp(const std::vector<double>& membrane_current);
extern template void LFPCalculator<PointSource>::lfp(const std::vector<double>& membrane_current);
}  // namespace coreneuron


================================================
FILE: coreneuron/io/mech_report.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <iostream>
#include <vector>

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/io/nrn_setup.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/apps/corenrn_parameters.hpp"

namespace coreneuron {
/**
 * Display the global mechanism count and memory size by mechanism type.
 *
 * Per-type instance counts and Memb_list sizes are accumulated over all
 * threads of this rank, summed over all ranks when MPI is enabled, and the
 * resulting table is printed to stdout by rank 0. Types with a zero count are
 * omitted.
 */
void write_mech_report() {
    /// mechanism count and size across all gids, local to rank
    const auto n_memb_func = corenrn.get_memb_funcs().size();
    std::vector<long> local_mech_count(n_memb_func, 0);
    std::vector<long> local_mech_size(n_memb_func, 0);

    /// each gid record goes on separate row, only check non-empty threads
    for (int i = 0; i < nrn_nthread; i++) {
        const auto& nt = nrn_threads[i];
        for (auto* tml = nt.tml; tml; tml = tml->next) {
            const int type = tml->index;
            local_mech_count[type] += tml->ml->nodecount;
            // Accumulate like the count above: a plain assignment would keep
            // only the size contributed by the last thread owning this type.
            local_mech_size[type] += memb_list_size(tml, true);
        }
    }

    std::vector<long> total_mech_count(n_memb_func);
    std::vector<long> total_mech_size(n_memb_func);

#if NRNMPI
    if (corenrn_param.mpi_enable) {
        /// get global sum of all mechanism instances
        // data() is valid even if the vectors are empty, unlike &v[0]
        nrnmpi_long_allreduce_vec(local_mech_count.data(),
                                  total_mech_count.data(),
                                  local_mech_count.size(),
                                  1);
        nrnmpi_long_allreduce_vec(local_mech_size.data(),
                                  total_mech_size.data(),
                                  local_mech_size.size(),
                                  1);
    } else
#endif
    {
        total_mech_count = local_mech_count;
        total_mech_size = local_mech_size;
    }

    /// print global stats to stdout
    if (nrnmpi_myid == 0) {
        printf("\n============== MECHANISMS COUNT AND SIZE BY TYPE =============\n");
        printf("%4s %20s %10s %25s\n", "Id", "Name", "Count", "Total memory size (KiB)");
        for (size_t i = 0; i < total_mech_count.size(); i++) {
            if (total_mech_count[i] > 0) {
                // nrn_get_mechname returns nullptr for an unknown type; never
                // hand a null pointer to %s.
                const char* mech_name = nrn_get_mechname(i);
                printf("%4zu %20s %10ld %25.2lf\n",
                       i,
                       mech_name ? mech_name : "(unknown)",
                       total_mech_count[i],
                       static_cast<double>(total_mech_size[i]) / 1024);
            }
        }
        printf("==============================================================\n");
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/mech_report.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <string>

namespace coreneuron {
/// Write mechanism counts and memory sizes, by mechanism type, to stdout.
/// The values are summed over all ranks when MPI is enabled and only rank 0
/// prints the table.
void write_mech_report();
}  // namespace coreneuron


================================================
FILE: coreneuron/io/mem_layout_util.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "mem_layout_util.hpp"

namespace coreneuron {

/// calculate size after padding for specific memory layout
// Warning: this function is declared extern in nrniv_decl.h
// Non-template entry point: forwards to soa_padded_size with the compile-time
// padding NRN_SOA_PAD.
int nrn_soa_padded_size(int cnt, int layout) {
    return soa_padded_size<NRN_SOA_PAD>(cnt, layout);
}

/// Round `size`, a count of doubles, up to the next count whose byte size is
/// a multiple of NRN_SOA_BYTE_ALIGN. Counts that are already aligned are
/// returned unchanged.
size_t nrn_soa_byte_align(size_t size) {
    static_assert(NRN_SOA_BYTE_ALIGN % sizeof(double) == 0,
                  "NRN_SOA_BYTE_ALIGN should be a multiple of sizeof(double)");
    // number of doubles that fit into one alignment unit
    constexpr size_t doubles_per_unit = NRN_SOA_BYTE_ALIGN / sizeof(double);
    // 0 when already aligned, otherwise the doubles missing to the next unit
    const size_t padding = (doubles_per_unit - size % doubles_per_unit) % doubles_per_unit;
    size += padding;
    nrn_assert((size * sizeof(double)) % NRN_SOA_BYTE_ALIGN == 0);
    return size;
}

/// Flat array index of item `isz` of instance `icnt` for a mechanism with
/// `cnt` instances of `sz` items each.
/// AoS: instances are contiguous, so the index is icnt * sz + isz.
/// SoA: items are contiguous over the padded instance count, so the index is
/// icnt + isz * padded_cnt.
/// Any other layout value trips nrn_assert.
int nrn_i_layout(int icnt, int cnt, int isz, int sz, int layout) {
    if (layout == Layout::AoS) {
        return icnt * sz + isz;
    }
    if (layout == Layout::SoA) {
        // may want to factor out to save time
        const int padded_cnt = nrn_soa_padded_size(cnt, layout);
        return icnt + isz * padded_cnt;
    }

    nrn_assert(false);
    return 0;
}

// File data is AoS, i.e. organized as cnt instances of mtype, each of size
// sz, so the input index i means i_instance * sz + i_item.
// For an AoS mechanism the index is returned as is. For an SoA mechanism it
// is split into (i_instance, i_item) and mapped through nrn_i_layout, which
// takes the padding of the instance count into account.
// Any other layout value trips nrn_assert.

int nrn_param_layout(int i, int mtype, Memb_list* ml) {
    const int layout = corenrn.get_mech_data_layout()[mtype];
    if (layout == Layout::AoS) {
        return i;
    }
    if (layout == Layout::SoA) {
        const int sz = corenrn.get_prop_param_size()[mtype];
        const int i_instance = i / sz;
        const int i_item = i % sz;
        return nrn_i_layout(i_instance, ml->nodecount, i_item, sz, layout);
    }
    nrn_assert(false);
    return 0;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/mem_layout_util.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"

namespace coreneuron {

#if !defined(NRN_SOA_PAD)
// for layout 0, every range variable array must have a size which
// is a multiple of NRN_SOA_PAD doubles.
// The value below is only a default: it applies when NRN_SOA_PAD has not
// been defined before this point.
#define NRN_SOA_PAD 8
#endif

/// Return the new offset considering the byte alignment settings:
/// i is a count of doubles and is rounded up so that its byte size is a
/// multiple of NRN_SOA_BYTE_ALIGN.
size_t nrn_soa_byte_align(size_t i);

/// This function returns the index in a flat array of a matrix coordinate (icnt, isz).
/// The matrix size is (cnt, sz).
/// Depending on the layout some padding can be taken into account.
int nrn_i_layout(int icnt, int cnt, int isz, int sz, int layout);

// file data is AoS. ie.
// organized as cnt array instances of mtype each of size sz.
// So input index i refers to i_instance*sz + i_item offset
// Return the corresponding SoA index -- taking into account the
// alignment requirements. Ie. i_instance + i_item*align_cnt.
// For a mechanism whose layout is AoS the index is returned unchanged.

int nrn_param_layout(int i, int mtype, Memb_list* ml);
}  // namespace coreneuron


================================================
FILE: coreneuron/io/mk_mech.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstring>
#include <map>
#include <iostream>
#include <fstream>
#include <sstream>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/membrane_definitions.h"
#include "coreneuron/mechanism/register_mech.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/mechanism/mech/cfile/cabvars.h"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/mechanism//eion.hpp"

// Banner line written to stderr once the mechanism metadata has been read,
// unless the banner is disabled via nrn_nobanner_.
static char banner[] = "Duke, Yale, and the BlueBrain Project -- Copyright 1984-2020";

namespace coreneuron {
// Defined elsewhere; the banner is only printed while this is 0.
extern int nrn_nobanner_;

// NB: this should go away
extern std::string cnrn_version();
// Mechanism name -> mechanism type id, filled while the mechanism metadata is
// read and queried by nrn_get_mechtype() / nrn_get_mechname().
std::map<std::string, int> mech2type;

extern "C" {
// Callback that writes the mechanism metadata into the given stream; used
// instead of bbcore_mech.dat when CoreNEURON runs embedded in NEURON.
void (*nrn2core_mkmech_info_)(std::ostream&);
}
// Overloads defined below: embedded mode, and parsing from an input stream.
static void mk_mech();
static void mk_mech(std::istream&);

/// Read the mechanism meta data and allocate the corresponding mechanism
/// management data structures.
/// In embedded mode the data comes from NEURON and `datpath` is not used.
/// Otherwise it is parsed from `<datpath>/bbcore_mech.dat`; a file that
/// cannot be opened is reported on stderr and then trips nrn_assert.
void mk_mech(const char* datpath) {
    if (!corenrn_embedded) {
        const std::string fname = std::string(datpath) + "/bbcore_mech.dat";
        std::ifstream fs(fname);

        if (!fs.good()) {
            // give the user a hint before the assertion below fires
            fprintf(stderr,
                    "Error: couldn't find bbcore_mech.dat file in the dataset directory \n");
            fprintf(stderr,
                    "       Make sure to pass full directory path of dataset using -d DIR or "
                    "--datpath=DIR \n");
        }

        nrn_assert(fs.good());
        mk_mech(fs);
        fs.close();
        return;
    }

    // we are embedded in NEURON
    mk_mech();
}

// Embedded in NEURON: obtain the mechanism info as a stringstream from
// nrnbbcore_write.cpp through the nrn2core_mkmech_info_ callback and parse it.
// Only the first call does any work; later calls return immediately.
static void mk_mech() {
    static bool already_called = false;
    if (!already_called) {
        nrn_assert(nrn2core_mkmech_info_);
        std::stringstream ss;
        nrn2core_mkmech_info_(ss);
        mk_mech(ss);
        already_called = true;
    }
}

/// Parse the mechanism meta data from `s` and register it.
///
/// Expected input, whitespace separated: a version token, the number of
/// mechanism types n, then for every type i in [2, n) a record
/// `name type pnttype is_art is_ion dsize pdsize`, followed by the ion charge
/// when is_ion is non-zero. Any malformed field trips nrn_assert.
/// Afterwards the banner is printed (rank < 1, unless disabled) and the _reg
/// functions of the default mechanisms are called.
static void mk_mech(std::istream& s) {
    // Tokens are read into std::string: extracting into a fixed-size char
    // array is not bounded before C++20 and overflows on an over-long token.
    std::string version;
    nrn_assert(s >> version);
    check_bbcore_write_version(version.c_str());

    //  printf("reading %s\n", fname);
    int n = 0;
    nrn_assert(s >> n);

    /// Allocate space for mechanism related data structures
    alloc_mech(n);

    /// Read all the mechanisms and their meta data
    for (int i = 2; i < n; ++i) {
        std::string mname;
        int type = 0, pnttype = 0, is_art = 0, is_ion = 0, dsize = 0, pdsize = 0;
        nrn_assert(s >> mname >> type >> pnttype >> is_art >> is_ion >> dsize >> pdsize);
        nrn_assert(i == type);
#ifdef DEBUG
        printf("%s %d %d %d %d %d %d\n",
               mname.c_str(),
               type,
               pnttype,
               is_art,
               is_ion,
               dsize,
               pdsize);
#endif
        // the sym slot holds a heap copy of the mechanism name
        corenrn.get_memb_func(type).sym = (Symbol*) strdup(mname.c_str());
        mech2type[mname] = type;
        corenrn.get_pnt_map()[type] = (char) pnttype;
        corenrn.get_prop_param_size()[type] = dsize;
        corenrn.get_prop_dparam_size()[type] = pdsize;
        corenrn.get_is_artificial()[type] = is_art;
        if (is_ion) {
            double charge = 0.;
            nrn_assert(s >> charge);
            // strip the _ion; a name shorter than the suffix would otherwise
            // make the length computation wrap around
            constexpr std::size_t suffix_len = 4;
            nrn_assert(mname.size() >= suffix_len);
            const std::string iname = mname.substr(0, mname.size() - suffix_len);
            // printf("%s %s\n", mname.c_str(), iname.c_str());
            ion_reg(iname.c_str(), charge);
        }
        // printf("%s %d %d\n", mname.c_str(), nrn_get_mechtype(mname.c_str()), type);
    }

    if (nrnmpi_myid < 1 && nrn_nobanner_ == 0) {
        fprintf(stderr, " \n");
        fprintf(stderr, " %s\n", banner);
        fprintf(stderr, " Version : %s\n", cnrn_version().c_str());
        fprintf(stderr, " \n");
        fflush(stderr);
    }
    /* will have to put this back if any mod file refers to diam */
    //	register_mech(morph_mech, morph_alloc, (Pfri)0, (Pfri)0, (Pfri)0, (Pfri)0, -1, 0);

    /// Calling _reg functions for the default mechanisms from the file mech/cfile/cabvars.h
    for (int i = 0; mechanism[i]; i++) {
        (*mechanism[i])();
    }
}

/// Look up the mechanism type registered under `name`.
/// Returns -1 when no mechanism with that name is known.
int nrn_get_mechtype(const char* name) {
    const auto found = mech2type.find(name);
    return found != mech2type.end() ? found->second : -1;
}

/// Reverse lookup: name of the mechanism registered with the given type, or
/// nullptr when there is none. Scans mech2type linearly; the returned pointer
/// refers to the key stored inside the map.
const char* nrn_get_mechname(int type) {
    for (auto entry = mech2type.begin(); entry != mech2type.end(); ++entry) {
        if (entry->second == type) {
            return entry->first.c_str();
        }
    }
    return nullptr;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn2core_data_init.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include <sstream>

#include "coreneuron/nrnconf.h"
#include "coreneuron/network/netpar.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/io/mem_layout_util.hpp"  // for WATCH use of nrn_i_layout
#include "coreneuron/utils/vrecitem.h"
#include "coreneuron/io/core2nrn_data_return.hpp"

namespace coreneuron {

// helper functions defined below; all of them are called from
// direct_mode_initialize().
static void nrn2core_tqueue();
static void watch_activate_clear();
static void nrn2core_transfer_watch_condition(int, int, int, int, int);
static void vec_play_activate();
static void nrn2core_patstim_share_info();

extern "C" {
/** Pointer to function in NEURON that iterates over activated
    WATCH statements, sending each item to the callback `cb` it is given.
**/
void (*nrn2core_transfer_watch_)(void (*cb)(int, int, int, int, int));
}

/**
  All state from NEURON necessary to continue a run.

  In NEURON direct mode, we desire the exact behavior of
  ParallelContext.psolve(tstop). I.e. a sequence of such calls with and
  without intervening calls to h.finitialize(). Most state (structure
  and data of the substantive model) has been copied
  from NEURON during nrn_setup. Now we need to copy the event queue
  and set up any other invalid internal structures. I.e basically the
  nrn_finitialize above but without changing any simulation data. We follow
  some of the strategy of checkpoint_initialize.

  The steps below run in a fixed order: the event queue is cleared first and
  refilled from NEURON's queue last.
**/
void direct_mode_initialize() {
    dt2thread(-1.);
    nrn_thread_table_check();

    // Start from an empty queue; events are re-created by the steps below.
    clear_event_queue();

    // Reproduce present NEURON WATCH activation
    // Start from nothing active.
    watch_activate_clear();
    // nrn2core_transfer_watch_condition(...) receives the WATCH activation info
    // on a per active WatchCondition basis from NEURON.
    // NOTE(review): the function pointer is called without a null check;
    // presumably NEURON always sets it in direct mode -- confirm.
    (*nrn2core_transfer_watch_)(nrn2core_transfer_watch_condition);

    nrn_spike_exchange_init();

    // the things done by checkpoint restore at the end of Phase2::read_file
    // vec_play_continuous n_vec_play_continuous of them
    // patstim_index
    // preSynConditionEventFlags nt.n_presyn of them
    // restore_events
    // the things done for checkpoint at the end of Phase2::populate
    // checkpoint_restore_tqueue
    // Lastly, if PatternStim exists, needs initialization
    // checkpoint_restore_patternstim
    // io/nrn_checkpoint.cpp: write_tqueue contains examples for each
    // DiscreteEvent type with regard to the information needed for each
    // subclass from the point of view of CoreNEURON.
    // E.g. for NetConType_, just netcon_index
    // The trick, then, is to figure out the CoreNEURON info from the
    // NEURON queue items and that should be available in passing from
    // the existing processing of nrncore_write.

    // activate the vec_play_continuous events defined in phase2 setup.
    vec_play_activate();

    // Any PreSyn.flag_ == 1 on the NEURON side needs to be transferred
    // or the PreSyn will spuriously fire when psolve starts.
    extern void nrn2core_PreSyn_flag_receive(int tid);
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        nrn2core_PreSyn_flag_receive(tid);
    }

    nrn2core_patstim_share_info();

    // Finally copy each thread's event queue from NEURON.
    nrn2core_tqueue();
}

void vec_play_activate() {
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        NrnThread* nt = nrn_threads + tid;
        for (int i = 0; i < nt->n_vecplay; ++i) {
            PlayRecord* pr = (PlayRecord*) nt->_vecplay[i];
            assert(pr->type() == VecPlayContinuousType);
            VecPlayContinuous* vpc = (VecPlayContinuous*) pr;
            assert(vpc->e_);
            assert(vpc->discon_indices_ == nullptr);  // not implemented
            vpc->e_->send(vpc->t_[vpc->ubound_index_], net_cvode_instance, nt);
        }
    }
}

}  // namespace coreneuron

// For direct transfer of event queue information
// Must be the same as corresponding struct NrnCoreTransferEvents in NEURON
// Do not put this coreneuron version in the coreneuron namespace so that the
// function pointer/callback has the same type in both NEURON and CoreNEURON.
// Calling a function through a pointer to a function of different type is
// undefined behaviour.
//
// `type` and `td` hold one entry per queue item and are indexed together.
// `intdata` and `dbldata` are flat streams: the reader consumes as many
// entries per item as that item's type requires, in queue order.
struct NrnCoreTransferEvents {
    std::vector<int> type;        // DiscreteEvent type
    std::vector<double> td;       // delivery time
    std::vector<int> intdata;     // ints specific to the DiscreteEvent type
    std::vector<double> dbldata;  // doubles specific to the type.
};

namespace coreneuron {

extern "C" {
/** Pointer to function in NEURON that iterates over its tqueue for thread
    `tid` and returns the collected events. The caller treats a null result
    as "nothing to transfer" for that thread.
**/
NrnCoreTransferEvents* (*nrn2core_transfer_tqueue_)(int tid);
}

// Cache for fast lookup of the movable index given the mechanism type:
// maps a mechanism type to the dparam slot that has netsend semantics.
static std::unordered_map<int, int> type2movable;

// Fill type2movable by scanning the dparam semantics of every mechanism type.
// Does nothing when the map already has entries. If a type has several slots
// with netsend semantics, the last one is kept.
static void setup_type2semantics() {
    if (!type2movable.empty()) {
        return;
    }
    const std::size_t n_memb_func = corenrn.get_memb_funcs().size();
    for (std::size_t type = 0; type < n_memb_func; ++type) {
        const int* semantics = corenrn.get_memb_func(type).dparam_semantics;
        if (!semantics) {
            continue;  // this type has no dparam semantics
        }
        const int dparam_size = corenrn.get_prop_dparam_size()[type];
        for (int slot = 0; slot < dparam_size; ++slot) {
            if (semantics[slot] == -4) {  // netsend semantics
                type2movable[type] = slot;
            }
        }
    }
}

/** Copy each thread's queue from NEURON **/
static void nrn2core_tqueue() {
    setup_type2semantics();                        // need type2movable for SelfEvent.
    for (int tid = 0; tid < nrn_nthread; ++tid) {  // should be parallel
        NrnCoreTransferEvents* ncte = (*nrn2core_transfer_tqueue_)(tid);
        if (ncte) {
            size_t idat = 0;
            size_t idbldat = 0;
            NrnThread& nt = nrn_threads[tid];
            for (size_t i = 0; i < ncte->type.size(); ++i) {
                switch (ncte->type[i]) {
                    case 0: {  // DiscreteEvent
                               // Ignore
                    } break;

                    case 2: {  // NetCon
                        int ncindex = ncte->intdata[idat++];
                        NetCon* nc = nt.netcons + ncindex;
#ifndef CORENRN_DEBUG_QUEUE
#define CORENRN_DEBUG_QUEUE 0
#endif
#if CORENRN_DEBUG_QUEUE
                        printf("nrn2core_tqueue tid=%d i=%zd type=%d tdeliver=%g NetCon %d\n",
                               tid,
                               i,
                               ncte->type[i],
                               ncte->td[i],
                               ncindex);
#endif
                        nc->send(ncte->td[i], net_cvode_instance, &nt);
                    } break;

                    case 3: {  // SelfEvent
                        // target_type, target_instance, weight_index, flag movable

                        // This is a nightmare and needs to be profoundly re-imagined.

                        // Determine Point_process*
                        int target_type = ncte->intdata[idat++];
                        int target_instance = ncte->intdata[idat++];
                        // From target_type and target_instance (mechanism data index)
                        // compute the nt.pntprocs index.
                        int offset = nt._pnt_offset[target_type];
                        Point_process* pnt = nt.pntprocs + offset + target_instance;
                        assert(pnt->_type == target_type);
                        Memb_list* ml = nt._ml_list[target_type];
                        if (ml->_permute) {
                            target_instance = ml->_permute[target_instance];
                        }
                        assert(pnt->_i_instance == target_instance);
                        assert(pnt->_tid == tid);

                        // Determine weight_index
                        int netcon_index = ncte->intdata[idat++];  // via the NetCon
                        int weight_index = -1;                     // no associated netcon
                        if (netcon_index >= 0) {
                            weight_index = nt.netcons[netcon_index].u.weight_index_;
                        }

                        double flag = ncte->dbldata[idbldat++];
                        int is_movable = ncte->intdata[idat++];
                        // If the queue item is movable, then the pointer needs to be
                        // stored in the mechanism instance movable slot by net_send.
                        // And don't overwrite if not movable. Only one SelfEvent
                        // for a given target instance is movable.
                        int movable_index =
                            nrn_i_layout(target_instance,
                                         ml->nodecount,
                                         type2movable[target_type],
                                         corenrn.get_prop_dparam_size()[target_type],
                                         corenrn.get_mech_data_layout()[target_type]);
                        void** movable_arg = nt._vdata + ml->pdata[movable_index];
                        TQItem* old_movable_arg = (TQItem*) (*movable_arg);
#if CORENRN_DEBUG_QUEUE
                        printf("nrn2core_tqueue tid=%d i=%zd type=%d tdeliver=%g SelfEvent\n",
                               tid,
                               i,
                               ncte->type[i],
                               ncte->td[i]);
                        printf(
                            "  target_type=%d pnt data index=%d flag=%g is_movable=%d netcon index "
                            "for weight=%d\n",
                            target_type,
                            target_instance,
                            flag,
                            is_movable,
                            netcon_index);
#endif
                        net_send(movable_arg, weight_index, pnt, ncte->td[i], flag);
                        if (!is_movable) {
                            *movable_arg = (void*) old_movable_arg;
                        }
                    } break;

                    case 4: {  // PreSyn
                        int type = ncte->intdata[idat++];
                        if (type == 0) {  // CoreNEURON PreSyn
                            int ps_index = ncte->intdata[idat++];
#if CORENRN_DEBUG_QUEUE
                            printf("nrn2core_tqueue tid=%d i=%zd type=%d tdeliver=%g PreSyn %d\n",
                                   tid,
                                   i,
                                   ncte->type[i],
                                   ncte->td[i],
                                   ps_index);
#endif
                            PreSyn* ps = nt.presyns + ps_index;
                            int gid = ps->output_index_;
                            // Following assumes already sent to other machines.
                            ps->output_index_ = -1;
                            ps->send(ncte->td[i], net_cvode_instance, &nt);
                            ps->output_index_ = gid;
                        } else {  // CoreNEURON InputPreSyn
                            int gid = ncte->intdata[idat++];
                            InputPreSyn* ps = gid2in[gid];
                            ps->send(ncte->td[i], net_cvode_instance, &nt);
                        }
                    } break;

                    case 6: {  // PlayRecordEvent
                               // Ignore as phase2 handles analogous to checkpoint restore.
                    } break;

                    case 7: {  // NetParEvent
#if CORENRN_DEBUG_QUEUE
                        printf("nrn2core_tqueue tid=%d i=%zd type=%d tdeliver=%g NetParEvent\n",
                               tid,
                               i,
                               ncte->type[i],
                               ncte->td[i]);
#endif
                    } break;

                    default: {
                        std::stringstream qetype;
                        qetype << ncte->type[i];
                        hoc_execerror("Unimplemented transfer queue event type:",
                                      qetype.str().c_str());
                    } break;
                }
            }
            delete ncte;
        }
    }
}

/** @brief Report the lowest and highest dparam slot whose semantics value is WATCH (-8).
 *
 *  @param type   mechanism type whose dparam semantics are scanned
 *  @param first  receives the lowest WATCH slot index, or -1 if no slot is WATCH.
 *                Note that this first slot is the WatchList item, not a WatchCondition.
 *  @param last   receives the highest WATCH slot index, or 0 if no slot is WATCH
 */
void watch_datum_indices(int type, int& first, int& last) {
    constexpr int watch_semantics = -8;
    const int* const slot_semantics = corenrn.get_memb_func(type).dparam_semantics;
    const int slot_count = corenrn.get_prop_dparam_size()[type];

    first = -1;
    last = 0;
    for (int slot = 0; slot < slot_count; ++slot) {
        if (slot_semantics[slot] != watch_semantics) {
            continue;
        }
        // Only the very first WATCH slot seen is remembered as `first`;
        // every WATCH slot seen so far is a candidate for `last`.
        if (first == -1) {
            first = slot;
        }
        last = slot;
    }
}

/** @brief Zero the WATCH-related pdata slots of every instance of every
 *  mechanism that has WATCH statements.
 *
 *  A mechanism is treated as having WATCH statements when its entry in
 *  corenrn.get_watch_check() is non-null. The slots that get cleared are the
 *  ones reported by watch_datum_indices(), first to last inclusive; they are
 *  identified from corenrn.get_memb_func(type).dparam_semantics.
 *
 *  Ironically, all WATCH statements may already be inactive as a consequence
 *  of the phase2 transfer. For direct mode psolve, however, the intent is to
 *  eventually minimise that transfer (at least with respect to structure).
 */
void watch_activate_clear() {
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        NrnThread& nt = nrn_threads[tid];
        for (NrnThreadMembList* tml = nt.tml; tml != nullptr; tml = tml->next) {
            const int type = tml->index;
            if (!corenrn.get_watch_check()[type]) {
                continue;  // this mechanism has no WATCH statements
            }

            Memb_list* ml = tml->ml;
            const int dparam_size = corenrn.get_prop_dparam_size()[type];

            // Range of WATCH slots. The first one is actually unused here but
            // is present because NEURON uses it. There is probably a better
            // way to do this.
            int first, last;
            watch_datum_indices(type, first, last);

            const int nodecount = ml->nodecount;
            const int layout = corenrn.get_mech_data_layout()[type];
            for (int iml = 0; iml < nodecount; ++iml) {
                for (int slot = first; slot <= last; ++slot) {
                    ml->pdata[nrn_i_layout(iml, nodecount, slot, dparam_size, layout)] = 0;
                }
            }
        }
    }
}

/** @brief Activate one WatchCondition of a point process instance.
 *
 *  @param tid          index into nrn_threads of the thread owning the instance
 *  @param pnttype      mechanism type of the point process
 *  @param pntindex     instance index within that type, before ml->_permute is applied
 *  @param watch_index  dparam slot of the WatchCondition, relative to the AoS
 *                      _ppvar of the instance
 *  @param triggered    added to 2 to form the value stored in the slot
 */
void nrn2core_transfer_watch_condition(int tid,
                                       int pnttype,
                                       int pntindex,
                                       int watch_index,
                                       int triggered) {
    NrnThread& nt = nrn_threads[tid];
    Point_process* pnt = &nt.pntprocs[nt._pnt_offset[pnttype] + pntindex];
    assert(pnt->_type == pnttype);

    Memb_list* ml = nt._ml_list[pnttype];
    // From here on pntindex is the permuted instance index, when a
    // permutation exists for this mechanism.
    if (ml->_permute) {
        pntindex = ml->_permute[pntindex];
    }
    assert(pnt->_i_instance == pntindex);
    assert(pnt->_tid == tid);

    // Perhaps all this should be more closely associated with phase2, since
    // this is really a (direct) transfer from NEURON and cannot rely on
    // finitialize() on the CoreNEURON side, which would otherwise set all
    // this up as a consequence of SelfEvents initiated and delivered at
    // time 0.
    // The index handling below is a best guess made after the reorganization
    // away from nrn_setup.cpp: nrn_i_layout is assumed to be the relevant
    // index transformation once the beginning of the mechanism pdata is known.
    const int dparam_size = corenrn.get_prop_dparam_size()[pnttype];
    const int layout = corenrn.get_mech_data_layout()[pnttype];
    const int slot = nrn_i_layout(pntindex, ml->nodecount, watch_index, dparam_size, layout);

    // activate the WatchCondition
    ml->pdata[slot] = 2 + triggered;
}

// PatternStim direct mode
// NEURON and CoreNEURON had different definitions for struct Info but
// the NEURON version of pattern.mod for PatternStim was changed to
// adopt the CoreNEURON version (along with THREADSAFE so they have the
// same param size). So they now both share the same
// instance of Info and NEURON is responsible for constructor/destructor.
// And in direct mode, PatternStim gets no special treatment except that
// on the CoreNEURON side, the Info struct points to the NEURON instance.

// from patstim.mod
// Called by nrn2core_patstim_share_info() with the data/pdata pointers of a
// PatternStim instance; the returned void** is forwarded unchanged to
// nrn2core_patternstim_.
// NOTE(review): presumably the result is the address of the slot holding the
// instance's Info pointer — confirm against patstim.mod.
extern void** pattern_stim_info_ref(int icnt,
                                    int cnt,
                                    double* _p,
                                    Datum* _ppvar,
                                    ThreadDatum* _thread,
                                    NrnThread* _nt,
                                    Memb_list* ml,
                                    double v);

extern "C" {
// Callback invoked by nrn2core_patstim_share_info() with the value returned
// by pattern_stim_info_ref().
// NOTE(review): presumably assigned by NEURON in direct mode; it is called
// without a null check — confirm it is always set before use.
void (*nrn2core_patternstim_)(void** info);
}

// In direct mode, CoreNEURON and NEURON share the same PatternStim Info
// Assume singleton for PatternStim but that is not really necessary in principle.
void nrn2core_patstim_share_info() {
    int type = nrn_get_mechtype("PatternStim");
    NrnThread* nt = nrn_threads + 0;
    Memb_list* ml = nt->_ml_list[type];
    if (ml) {
        int layout = corenrn.get_mech_data_layout()[type];
        int sz = corenrn.get_prop_param_size()[type];
        int psz = corenrn.get_prop_dparam_size()[type];
        int _cntml = ml->nodecount;
        assert(ml->nodecount == 1);
        int _iml = 0;  // Assume singleton here and in (*nrn2core_patternstim_)(info) below.
        double* _p = ml->data;
        Datum* _ppvar = ml->pdata;
        if (layout == Layout::AoS) {
            _p += _iml * sz;
            _ppvar += _iml * psz;
        } else if (layout == Layout::SoA) {
            ;
        } else {
            assert(0);
        }

        void** info = pattern_stim_info_ref(_iml, _cntml, _p, _ppvar, nullptr, nt, ml, 0.0);
        (*nrn2core_patternstim_)(info);
    }
}


}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn2core_direct.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <iostream>
#include <vector>

extern "C" {
// The callbacks into nrn/src/nrniv/nrnbbcore_write.cpp to get
// data directly instead of via files.
// Every callback below is declared as a pointer to function with C linkage.
// NOTE(review): presumably NEURON assigns these pointers before CoreNEURON
// runs in embedded (direct) mode — confirm against the NEURON side.

// NOTE(review): presumably true when CoreNEURON is run embedded in NEURON,
// with corenrn_embedded_nthread giving the thread count — confirm.
extern bool corenrn_embedded;
extern int corenrn_embedded_nthread;

/* writes into a caller-provided int array (presumably group ids — confirm) */
extern void (*nrn2core_group_ids_)(int*);

/* writes mechanism information to the given output stream */
extern void (*nrn2core_mkmech_info_)(std::ostream&);

/* global variable access; name, size and val are output (reference) parameters
   of the double variant.
   NOTE(review): the void* argument/return presumably acts as an iteration
   handle — confirm. */
extern void* (*nrn2core_get_global_dbl_item_)(void*, const char*& name, int& size, double*& val);
extern int (*nrn2core_get_global_int_item_)(const char* name);

/* per-thread data; everything after tid is an output (reference) parameter.
   NOTE(review): presumably the direct-mode equivalent of the phase 1 file —
   confirm. */
extern int (*nrn2core_get_dat1_)(int tid,
                                 int& n_presyn,
                                 int& n_netcon,
                                 int*& output_gid,
                                 int*& netcon_srcgid,
                                 std::vector<int>& netcon_negsrcgid_tid);

/* per-thread sizes and mechanism list; everything after tid is an output
   (reference) parameter.
   NOTE(review): the nrn2core_get_dat2_* callbacks presumably mirror successive
   sections of the phase 2 file — confirm. */
extern int (*nrn2core_get_dat2_1_)(int tid,
                                   int& n_real_cell,
                                   int& ngid,
                                   int& n_real_gid,
                                   int& nnode,
                                   int& ndiam,
                                   int& nmech,
                                   int*& tml_index,
                                   int*& ml_nodecount,
                                   int& nidata,
                                   int& nvdata,
                                   int& nweight);

/* per-thread node arrays; everything after tid is an output (reference) parameter */
extern int (*nrn2core_get_dat2_2_)(int tid,
                                   int*& v_parent_index,
                                   double*& a,
                                   double*& b,
                                   double*& area,
                                   double*& v,
                                   double*& diamvec);

/* data of one mechanism of a thread; nodeindices, data, pdata and pointer2type
   are output (reference) parameters.
   NOTE(review): i is presumably the index into the thread's mechanism list
   and dsz_inst the dparam size per instance — confirm. */
extern int (*nrn2core_get_dat2_mech_)(int tid,
                                      size_t i,
                                      int dsz_inst,
                                      int*& nodeindices,
                                      double*& data,
                                      int*& pdata,
                                      std::vector<int>& pointer2type);

/* per-thread PreSyn/NetCon arrays; everything after nweight is an output
   (reference) parameter */
extern int (*nrn2core_get_dat2_3_)(int tid,
                                   int nweight,
                                   int*& output_vindex,
                                   double*& output_threshold,
                                   int*& netcon_pnttype,
                                   int*& netcon_pntindex,
                                   double*& weights,
                                   double*& delays);

/* n is an output (reference) parameter.
   NOTE(review): presumably the number of mechanisms with BBCOREPOINTER data —
   confirm. */
extern int (*nrn2core_get_dat2_corepointer_)(int tid, int& n);

/* icnt, dcnt, iarray and darray are output (reference) parameters for the
   given mechanism type */
extern int (*nrn2core_get_dat2_corepointer_mech_)(int tid,
                                                  int type,
                                                  int& icnt,
                                                  int& dcnt,
                                                  int*& iarray,
                                                  double*& darray);

/* indices is filled by the callee (passed by reference) */
extern int (*nrn2core_get_dat2_vecplay_)(int tid, std::vector<int>& indices);

/* description of VecPlay instance i; everything after i is an output
   (reference) parameter */
extern int (*nrn2core_get_dat2_vecplay_inst_)(int tid,
                                              int i,
                                              int& vptype,
                                              int& mtype,
                                              int& ix,
                                              int& sz,
                                              double*& yvec,
                                              double*& tvec,
                                              int& last_index,
                                              int& discon_index,
                                              int& ubound_index);

/* NOTE(review): presumably releases NEURON-side buffers used by the
   nrn2core_get_dat2_* callbacks — confirm */
extern void (*nrn2core_part2_clean_)();

/* what variables to send back to NEURON on each time step */
extern void (*nrn2core_get_trajectory_requests_)(int tid,
                                                 int& bsize,
                                                 int& n_pr,
                                                 void**& vpr,
                                                 int& n_trajec,
                                                 int*& types,
                                                 int*& indices,
                                                 double**& pvars,
                                                 double**& varrays);

/* send values to NEURON on each time step */
extern void (*nrn2core_trajectory_values_)(int tid, int n_pr, void** vpr, double t);

/* Fill the Vector data arrays and send back the sizes at end of run */
extern void (
    *nrn2core_trajectory_return_)(int tid, int n_pr, int bsize, int vecsz, void** vpr, double t);

/* send all spikes vectors to NEURON */
extern int (*nrn2core_all_spike_vectors_return_)(std::vector<double>& spikevec,
                                                 std::vector<int>& gidvec);

/* send all weights to NEURON */
extern void (*nrn2core_all_weights_return_)(std::vector<double*>& weights);

/* get data array pointer from NEURON to copy into. */
extern size_t (*nrn2core_type_return_)(int type, int tid, double*& data, double**& mdata);
}  // extern "C"


================================================
FILE: coreneuron/io/nrn_checkpoint.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include <iostream>
#include <sstream>
#include <cassert>
#include <memory>

#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/io/nrn_filehandler.hpp"
#include "coreneuron/io/nrn_checkpoint.hpp"
#include "coreneuron/io/nrn_setup.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/network/netpar.hpp"
#include "coreneuron/utils/vrecitem.h"
#include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
#include "coreneuron/io/file_utils.hpp"
#include "coreneuron/permute/data_layout.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

namespace coreneuron {
// These functions come directly from the mod file.
// NOTE(review): presumably the PatternStim mod file, given their names — confirm.
extern int checkpoint_save_patternstim(_threadargsproto_);
extern void checkpoint_restore_patternstim(int, double, _threadargsproto_);

// Remember the save and restore directories. When a save directory is given,
// the process whose nrnmpi_myid is 0 creates it (including parents, via mkdir_p).
CheckPoints::CheckPoints(const std::string& save, const std::string& restore)
    : save_(save)
    , restore_(restore)
    , restored(false) {
    const bool save_requested = !save.empty();
    if (save_requested && nrnmpi_myid == 0) {
        mkdir_p(save.c_str());
    }
}

/// todo : need to broadcast this rather than all reading a double
double CheckPoints::restore_time() const {
    if (!should_restore()) {
        return 0.;
    }

    double rtime = 0.;
    FileHandler f;
    std::string filename = restore_ + "/time.dat";
    f.open(filename, std::ios::in);
    f.read_array(&rtime, 1);
    f.close();
    return rtime;
}

// Write a checkpoint: one phase2 file per non-empty thread, plus the time file
// written by the process whose nrnmpi_myid is 0. Does nothing unless
// should_save() is true. With MPI enabled, the writing is bracketed by barriers.
void CheckPoints::write_checkpoint(NrnThread* nt, int nb_threads) const {
    if (!should_save()) {
        return;
    }

    // Barrier across processes; a no-op without MPI support or when MPI is disabled.
    const auto synchronize = [] {
#if NRNMPI
        if (corenrn_param.mpi_enable) {
            nrnmpi_barrier();
        }
#endif
    };

    synchronize();

    /**
     * if openmp threading needed:
     *  #pragma omp parallel for private(i) shared(nt, nb_threads) schedule(runtime)
     */
    for (int tid = 0; tid < nb_threads; ++tid) {
        NrnThread& thread = nt[tid];
        const bool has_content = thread.ncell || thread.tml;
        if (!has_content) {
            continue;  // threads with neither cells nor mechanisms are skipped
        }
        write_phase2(thread);
    }

    if (nrnmpi_myid == 0) {
        write_time();
    }

    synchronize();
}

// Convert an index into nt._data that refers to a parameter of mechanism
// `etype` into the AoS index (instance * param_size + field) relative to that
// mechanism's data, undoing the SoA layout and any instance permutation.
// Shared by the ion and POINTER handling of the checkpoint writer.
// ml_pinv caches the inverse permutation per mechanism type; the entry for
// etype is created here on first use.
static int nrn_original_aos_index(int etype, int ix, NrnThread& nt, int** ml_pinv) {
    Memb_list* eml = nt._ml_list[etype];
    int instance_count = eml->nodecount;
    int param_size = corenrn.get_prop_param_size()[etype];
    int layout = corenrn.get_mech_data_layout()[etype];

    // The current offset into eml->data is a function of the layout,
    // eml->_permute, the instance, the field and the eml padding.
    int offset = ix - (eml->data - nt._data);
    assert(offset >= 0 && offset < eml->_nodecount_padded * param_size);

    int instance, field;
    nrn_inverse_i_layout(offset, instance, instance_count, field, param_size, layout);

    // Only SoA data with a permutation needs the instance mapped back.
    if (layout == Layout::SoA && eml->_permute) {
        int*& inverse = ml_pinv[etype];
        if (!inverse) {
            inverse = inverse_permute(eml->_permute, eml->nodecount);
        }
        instance = inverse[instance];
    }
    return instance * param_size + field;
}

void CheckPoints::write_phase2(NrnThread& nt) const {
    FileHandler fh;

    NrnThreadChkpnt& ntc = nrnthread_chkpnt[nt.id];
    auto filename = get_save_path() + "/" + std::to_string(ntc.file_id) + "_2.dat";

    fh.open(filename, std::ios::out);
    fh.checkpoint(2);

    int n_outputgid = 0;  // calculate PreSyn with gid >= 0
    for (int i = 0; i < nt.n_presyn; ++i) {
        if (nt.presyns[i].gid_ >= 0) {
            ++n_outputgid;
        }
    }

    fh << nt.ncell << " ncell\n";
    fh << n_outputgid << " ngid\n";
#if CHKPNTDEBUG
    assert(ntc.n_outputgids == n_outputgid);
#endif

    fh << nt.n_real_output << " n_real_output\n";
    fh << nt.end << " nnode\n";
    fh << ((nt._actual_diam == nullptr) ? 0 : nt.end) << " ndiam\n";
    int nmech = 0;
    for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
        if (tml->index != patstimtype) {  // skip PatternStim
            ++nmech;
        }
    }

    fh << nmech << " nmech\n";
#if CHKPNTDEBUG
    assert(nmech == ntc.nmech);
#endif

    for (NrnThreadMembList* current_tml = nt.tml; current_tml; current_tml = current_tml->next) {
        if (current_tml->index == patstimtype) {
            continue;
        }
        fh << current_tml->index << "\n";
        fh << current_tml->ml->nodecount << "\n";
    }

    fh << nt._nidata << " nidata\n";
    fh << nt._nvdata << " nvdata\n";
    fh << nt.n_weight << " nweight\n";

    // see comment about parent in node_permute.cpp
    int* pinv_nt = nullptr;
    if (nt._permute) {
        int* d = new int[nt.end];
        pinv_nt = inverse_permute(nt._permute, nt.end);
        for (int i = 0; i < nt.end; ++i) {
            int x = nt._v_parent_index[nt._permute[i]];
            if (x >= 0) {
                d[i] = pinv_nt[x];
            } else {
                d[i] = 0;  // really should be -1;
            }
        }
#if CHKPNTDEBUG
        for (int i = 0; i < nt.end; ++i) {
            assert(d[i] == ntc.parent[i]);
        }
#endif
        fh.write_array<int>(d, nt.end);
        delete[] d;
    } else {
#if CHKPNTDEBUG
        for (int i = 0; i < nt.end; ++i) {
            assert(nt._v_parent_index[i] == ntc.parent[i]);
        }
#endif
        fh.write_array<int>(nt._v_parent_index, nt.end);
        pinv_nt = new int[nt.end];
        for (int i = 0; i < nt.end; ++i) {
            pinv_nt[i] = i;
        }
    }

    data_write(fh, nt._actual_a, nt.end, 1, 0, nt._permute);
    data_write(fh, nt._actual_b, nt.end, 1, 0, nt._permute);

#if CHKPNTDEBUG
    for (int i = 0; i < nt.end; ++i) {
        assert(nt._actual_area[i] == ntc.area[pinv_nt[i]]);
    }
#endif

    data_write(fh, nt._actual_area, nt.end, 1, 0, nt._permute);
    data_write(fh, nt._actual_v, nt.end, 1, 0, nt._permute);

    if (nt._actual_diam) {
        data_write(fh, nt._actual_diam, nt.end, 1, 0, nt._permute);
    }

    auto& memb_func = corenrn.get_memb_funcs();
    // will need the ml_pinv inverse permutation of ml._permute for ions and POINTER
    int** ml_pinv = (int**) ecalloc(memb_func.size(), sizeof(int*));

    for (NrnThreadMembList* current_tml = nt.tml; current_tml; current_tml = current_tml->next) {
        Memb_list* ml = current_tml->ml;
        int type = current_tml->index;
        if (type == patstimtype) {
            continue;
        }
        int cnt = ml->nodecount;
        auto& nrn_prop_param_size_ = corenrn.get_prop_param_size();
        auto& nrn_prop_dparam_size_ = corenrn.get_prop_dparam_size();
        auto& nrn_is_artificial_ = corenrn.get_is_artificial();

        int sz = nrn_prop_param_size_[type];
        int layout = corenrn.get_mech_data_layout()[type];
        int* semantics = memb_func[type].dparam_semantics;

        if (!nrn_is_artificial_[type]) {
            // ml->nodeindices values are permuted according to nt._permute
            // and locations according to ml._permute
            // i.e. according to comment in node_permute.cpp
            // nodelist[p_m[i]] = p[nodelist_original[i]
            // so pinv[nodelist[p_m[i]] = nodelist_original[i]
            int* nd_ix = new int[cnt];
            for (int i = 0; i < cnt; ++i) {
                int ip = ml->_permute ? ml->_permute[i] : i;
                int ipval = ml->nodeindices[ip];
                nd_ix[i] = pinv_nt[ipval];
            }
            fh.write_array<int>(nd_ix, cnt);
            delete[] nd_ix;
        }

        data_write(fh, ml->data, cnt, sz, layout, ml->_permute);

        sz = nrn_prop_dparam_size_[type];
        if (sz) {
            // need to update some values according to Datum semantics.
            int* d = soa2aos(ml->pdata, cnt, sz, layout, ml->_permute);
            std::vector<int> pointer2type;  // voltage or mechanism type (starts empty)
            if (!nrn_is_artificial_[type]) {
                for (int i_instance = 0; i_instance < cnt; ++i_instance) {
                    for (int i = 0; i < sz; ++i) {
                        int ix = i_instance * sz + i;
                        int s = semantics[i];
                        if (s == -1) {  // area
                            int p = pinv_nt[d[ix] - (nt._actual_area - nt._data)];
                            d[ix] = p;         // relative _actual_area
                        } else if (s == -9) {  // diam
                            int p = pinv_nt[d[ix] - (nt._actual_diam - nt._data)];

                            d[ix] = p;         // relative to _actual_diam
                        } else if (s == -5) {  // POINTER
                            // loop over instances, then sz, means that we
                            // visit consistent with natural order of
                            // pointer2type

                            // Relevant code that this has to invert
                            // is permute/node_permute.cpp :: update_pdata_values with
                            // respect to permutation, and
                            // io/phase2.cpp :: Phase2::pdata_relocation
                            // with respect to that AoS -> SoA

                            // Step 1: what mechanism is d[ix] pointing to
                            int ptype = type_of_ntdata(nt, d[ix], i_instance == 0);
                            pointer2type.push_back(ptype);

                            // Step 2: replace d[ix] with AoS index relative to type
                            if (ptype == voltage) {
                                int p = pinv_nt[d[ix] - (nt._actual_v - nt._data)];
                                d[ix] = p;  // relative to _actual_v
                            } else {
                                // Since we know ptype, the situation is
                                // identical to ion below. (which was factored
                                // out into the following function.
                                d[ix] = nrn_original_aos_index(ptype, d[ix], nt, ml_pinv);
                            }
                        } else if (s >= 0 && s < 1000) {  // ion
                            d[ix] = nrn_original_aos_index(s, d[ix], nt, ml_pinv);
                        }
#if CHKPNTDEBUG
                        if (s != -8) {  // WATCH values change
                            assert(d[ix] ==
                                   ntc.mlmap[type]->pdata_not_permuted[i_instance * sz + i]);
                        }
#endif
                    }
                }
            }
            fh.write_array<int>(d, cnt * sz);
            delete[] d;
            size_t s = pointer2type.size();
            fh << s << " npointer\n";
            if (s) {
                fh.write_array<int>(pointer2type.data(), s);
            }
        }
    }

    int nnetcon = nt.n_netcon;

    int* output_vindex = new int[nt.n_presyn];
    double* output_threshold = new double[nt.n_real_output];
    for (int i = 0; i < nt.n_presyn; ++i) {
        PreSyn* ps = nt.presyns + i;
        if (ps->thvar_index_ >= 0) {
            // real cell and index into (permuted) actual_v
            // if any assert fails in this loop then we have faulty understanding
            // of the for (int i = 0; i < nt.n_presyn; ++i) loop in nrn_setup.cpp
            assert(ps->thvar_index_ < nt.end);
            assert(ps->pntsrc_ == nullptr);
            output_threshold[i] = ps->threshold_;
            output_vindex[i] = pinv_nt[ps->thvar_index_];
        } else if (i < nt.n_real_output) {  // real cell without a presyn
            output_threshold[i] = 0.0;      // the way it was set in nrnbbcore_write.cpp
            output_vindex[i] = -1;
        } else {
            Point_process* pnt = ps->pntsrc_;
            assert(pnt);
            int type = pnt->_type;
            int ix = pnt->_i_instance;
            if (nt._ml_list[type]->_permute) {
                // pnt->_i_instance is the permuted index into pnt->_type
                if (!ml_pinv[type]) {
                    Memb_list* ml = nt._ml_list[type];
                    ml_pinv[type] = inverse_permute(ml->_permute, ml->nodecount);
                }
                ix = ml_pinv[type][ix];
            }
            output_vindex[i] = -(ix * 1000 + type);
        }
    }
    fh.write_array<int>(output_vindex, nt.n_presyn);
    fh.write_array<double>(output_threshold, nt.n_real_output);
#if CHKPNTDEBUG
    for (int i = 0; i < nt.n_presyn; ++i) {
        assert(ntc.output_vindex[i] == output_vindex[i]);
    }
    for (int i = 0; i < nt.n_real_output; ++i) {
        assert(ntc.output_threshold[i] == output_threshold[i]);
    }
#endif
    delete[] output_vindex;
    delete[] output_threshold;
    delete[] pinv_nt;

    int synoffset = 0;
    std::vector<int> pnt_offset(memb_func.size(), -1);
    for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
        int type = tml->index;
        if (corenrn.get_pnt_map()[type] > 0) {
            pnt_offset[type] = synoffset;
            synoffset += tml->ml->nodecount;
        }
    }

    int* pnttype = new int[nnetcon];
    int* pntindex = new int[nnetcon];
    double* delay = new double[nnetcon];
    for (int i = 0; i < nnetcon; ++i) {
        NetCon& nc = nt.netcons[i];
        Point_process* pnt = nc.target_;
        if (pnt == nullptr) {
            // nrn_setup.cpp allows type <=0 which generates nullptr target.
            pnttype[i] = 0;
            pntindex[i] = -1;
        } else {
            pnttype[i] = pnt->_type;

            // todo: this seems most natural, but does not work. Perhaps should look
            // into how pntindex determined in nrnbbcore_write.cpp and change there.
            // int ix = pnt->_i_instance;
            // if (ml_pinv[pnt->_type]) {
            //     ix = ml_pinv[pnt->_type][ix];
            // }

            // follow the inverse of nrn_setup.cpp using pnt_offset computed above.
            int ix = (pnt - nt.pntprocs) - pnt_offset[pnt->_type];
            pntindex[i] = ix;
        }
        delay[i] = nc.delay_;
    }
    fh.write_array<int>(pnttype, nnetcon);
    fh.write_array<int>(pntindex, nnetcon);
    fh.write_array<double>(nt.weights, nt.n_weight);
    fh.write_array<double>(delay, nnetcon);
#if CHKPNTDEBUG
    for (int i = 0; i < nnetcon; ++i) {
        assert(ntc.pnttype[i] == pnttype[i]);
        assert(ntc.pntindex[i] == pntindex[i]);
        assert(ntc.delay[i] == delay[i]);
    }
#endif
    delete[] pnttype;
    delete[] pntindex;
    delete[] delay;

    // BBCOREPOINTER
    int nbcp = 0;
    for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
        if (corenrn.get_bbcore_read()[tml->index] && tml->index != patstimtype) {
            ++nbcp;
        }
    }

    fh << nbcp << " bbcorepointer\n";
#if CHKPNTDEBUG
    assert(nbcp == ntc.nbcp);
#endif
    nbcp = 0;
    for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
        if (corenrn.get_bbcore_read()[tml->index] && tml->index != patstimtype) {
            int i = nbcp++;
            int type = tml->index;
            assert(corenrn.get_bbcore_write()[type]);
            Memb_list* ml = tml->ml;
            double* d = nullptr;
            Datum* pd = nullptr;
            int layout = corenrn.get_mech_data_layout()[type];
            int dsz = corenrn.get_prop_param_size()[type];
            int pdsz = corenrn.get_prop_dparam_size()[type];
            int aln_cntml = nrn_soa_padded_size(ml->nodecount, layout);
            fh << type << "\n";
            int icnt = 0;
            int dcnt = 0;
            // data size and allocate
            for (int j = 0; j < ml->nodecount; ++j) {
                int jp = j;
                if (ml->_permute) {
                    jp = ml->_permute[j];
                }
                d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout);
                pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);
                (*corenrn.get_bbcore_write()[type])(
                    nullptr, nullptr, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0);
            }
            fh << icnt << "\n";
            fh << dcnt << "\n";
#if CHKPNTDEBUG
            assert(ntc.bcptype[i] == type);
            assert(ntc.bcpicnt[i] == icnt);
            assert(ntc.bcpdcnt[i] == dcnt);
#endif
            int* iArray = nullptr;
            double* dArray = nullptr;
            if (icnt) {
                iArray = new int[icnt];
            }
            if (dcnt) {
                dArray = new double[dcnt];
            }
            icnt = dcnt = 0;
            for (int j = 0; j < ml->nodecount; j++) {
                int jp = j;

                if (ml->_permute) {
                    jp = ml->_permute[j];
                }

                d = ml->data + nrn_i_layout(jp, ml->nodecount, 0, dsz, layout);
                pd = ml->pdata + nrn_i_layout(jp, ml->nodecount, 0, pdsz, layout);

                (*corenrn.get_bbcore_write()[type])(
                    dArray, iArray, &dcnt, &icnt, 0, aln_cntml, d, pd, ml->_thread, &nt, ml, 0.0);
            }

            if (icnt) {
                fh.write_array<int>(iArray, icnt);
                delete[] iArray;
            }

            if (dcnt) {
                fh.write_array<double>(dArray, dcnt);
                delete[] dArray;
            }
            ++i;
        }
    }

    fh << nt.n_vecplay << " VecPlay instances\n";
    for (int i = 0; i < nt.n_vecplay; i++) {
        PlayRecord* pr = (PlayRecord*) nt._vecplay[i];
        int vtype = pr->type();
        int mtype = -1;
        int ix = -1;

        // not as efficient as possible but there should not be too many
        Memb_list* ml = nullptr;
        for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
            ml = tml->ml;
            int nn = corenrn.get_prop_param_size()[tml->index] * ml->nodecount;
            if (nn && pr->pd_ >= ml->data && pr->pd_ < (ml->data + nn)) {
                mtype = tml->index;
                ix = (pr->pd_ - ml->data);
                break;
            }
        }
        assert(mtype >= 0);
        int icnt, isz;
        nrn_inverse_i_layout(ix,
                             icnt,
                             ml->nodecount,
                             isz,
                             corenrn.get_prop_param_size()[mtype],
                             corenrn.get_mech_data_layout()[mtype]);
        if (ml_pinv[mtype]) {
            icnt = ml_pinv[mtype][icnt];
        }
        ix = nrn_i_layout(
            icnt, ml->nodecount, isz, corenrn.get_prop_param_size()[mtype], AOS_LAYOUT);

        fh << vtype << "\n";
        fh << mtype << "\n";
        fh << ix << "\n";
#if CHKPNTDEBUG
        assert(ntc.vtype[i] == vtype);
        assert(ntc.mtype[i] == mtype);
        assert(ntc.vecplay_ix[i] == ix);
#endif
        if (vtype == VecPlayContinuousType) {
            VecPlayContinuous* vpc = (VecPlayContinuous*) pr;
            int sz = vpc->y_.size();
            fh << sz << "\n";
            fh.write_array<double>(vpc->y_.data(), sz);
            fh.write_array<double>(vpc->t_.data(), sz);
        } else {
            std::cerr << "Error checkpointing vecplay type" << std::endl;
            assert(0);
        }
    }

    for (size_t i = 0; i < memb_func.size(); ++i) {
        if (ml_pinv[i]) {
            delete[] ml_pinv[i];
        }
    }
    free(ml_pinv);

    write_tqueue(nt, fh);
    fh.close();
}

void CheckPoints::write_time() const {
    FileHandler f;
    auto filename = get_save_path() + "/time.dat";
    f.open(filename, std::ios::out);
    f.write_array(&t, 1);
    f.close();
}

// A call to finitialize must be avoided after restoring the checkpoint
// as that would change all states to a voltage clamp initialization.
// Nevertheless t and some spike exchange and other computer state needs to
// be initialized.
// Also it is occasionally the case that nrn_init allocates data so we
// need to call it but avoid the internal call to initmodel.
// Consult finitialize.c to help decide what should be here
bool CheckPoints::initialize() {
    dt2thread(-1.);
    nrn_thread_table_check();
    nrn_spike_exchange_init();

    allocate_data_in_mechanism_nrn_init();

    // if PatternStim exists, needs initialization
    for (NrnThreadMembList* tml = nrn_threads[0].tml; tml; tml = tml->next) {
        if (tml->index == patstimtype && patstim_index >= 0 && patstim_te > 0.0) {
            Memb_list* ml = tml->ml;
            checkpoint_restore_patternstim(patstim_index,
                                           patstim_te,
                                           /* below correct only for AoS */
                                           0,
                                           ml->nodecount,
                                           ml->data,
                                           ml->pdata,
                                           ml->_thread,
                                           nrn_threads,
                                           ml,
                                           0.0);
            break;
        }
    }

    // Check that bbcore_write is defined if we want to use checkpoint
    for (NrnThreadMembList* tml = nrn_threads[0].tml; tml; tml = tml->next) {
        auto type = tml->index;
        if (corenrn.get_bbcore_read()[type] && !corenrn.get_bbcore_write()[type]) {
            auto memb_func = corenrn.get_memb_func(type);
            fprintf(stderr,
                    "Checkpoint is requested involving BBCOREPOINTER but there is no bbcore_write"
                    " function for %s\n",
                    memb_func.sym);
            assert(corenrn.get_bbcore_write()[type]);
        }
    }


    return restored;
}

/// Return a newly allocated (new[]) array of cnt * sz elements holding `data`
/// in AoS order, i.e. element (i, j) at index i * sz + j.
///
/// This is the inverse of the file -> data transformation. For Layout::AoS the
/// input is copied as is. For Layout::SoA the source index depends on the
/// padded instance count and, when `permute` is non-null, on the permutation.
/// For any other layout value the returned array is left unfilled.
/// Good for a, b, area, v, diam, Memb_list.data, or anywhere values do not change.
/// The caller owns the result and must delete[] it.
template <typename T>
T* CheckPoints::soa2aos(T* data, int cnt, int sz, int layout, int* permute) const {
    T* const aos = new T[cnt * sz];

    if (layout == Layout::AoS) {
        const int total = cnt * sz;
        for (int k = 0; k < total; ++k) {
            aos[k] = data[k];
        }
    } else if (layout == Layout::SoA) {
        const int stride = nrn_soa_padded_size(cnt, layout);
        T* out = aos;
        for (int inst = 0; inst < cnt; ++inst) {
            const int src = permute ? permute[inst] : inst;
            for (int var = 0; var < sz; ++var) {
                // walks aos[inst * sz + var] sequentially
                *out++ = data[src + var * stride];
            }
        }
    }

    return aos;
}

/// Write `data` (cnt instances of sz values) to `F` in AoS order.
///
/// The data is first converted with soa2aos(); the temporary copy is released
/// once it has been written.
template <typename T>
void CheckPoints::data_write(FileHandler& F, T* data, int cnt, int sz, int layout, int* permute)
    const {
    T* const aos_copy = soa2aos(data, cnt, sz, layout, permute);
    const int n_values = cnt * sz;
    F.write_array<T>(aos_copy, n_values);
    delete[] aos_copy;
}

// Definition of the pointer declared `extern` in nrn_checkpoint.hpp.
// NOTE(review): presumably an array with one entry per NrnThread — confirm
// against the code that allocates it.
NrnThreadChkpnt* nrnthread_chkpnt;

// Mechanism type used to recognise PatternStim: it is compared against
// NrnThreadMembList::index and against the target type of saved SelfEvents.
int patstimtype;

/// Serialise one queue item into the checkpoint stream.
///
/// Items whose event reports require_checkpoint() == false are skipped.
/// Otherwise the record consists of the event type, the item time q->t_
/// (as a one-element array) and a payload that depends on the event type.
void CheckPoints::write_tqueue(TQItem* q, NrnThread& nt, FileHandler& fh) const {
    DiscreteEvent* const event = (DiscreteEvent*) q->data_;
    if (!event->require_checkpoint()) {
        return;
    }

    const auto kind = event->type();
    fh << kind << "\n";
    fh.write_array(&q->t_, 1);

    switch (kind) {
        case NetConType: {
            // payload: index of the NetCon inside nt.netcons
            NetCon* nc = (NetCon*) event;
            assert(nc >= nt.netcons && (nc < (nt.netcons + nt.n_netcon)));
            const auto netcon_offset = nc - nt.netcons;
            fh << netcon_offset << "\n";
            break;
        }
        case SelfEventType: {
            SelfEvent* const self_event = (SelfEvent*) event;
            auto* const target = self_event->target_;
            fh << int(target->_type) << "\n";
            fh << (target - nt.pntprocs) << "\n";  // index of nrnthread.pntprocs
            fh << target->_i_instance << "\n";     // not needed except for assert check
            fh.write_array(&self_event->flag_, 1);
            fh << (self_event->movable_ - nt._vdata) << "\n";  // DANGEROUS?
            fh << self_event->weight_index_ << "\n";
            break;
        }
        case PreSynType: {
            // payload: index of the PreSyn inside nt.presyns
            PreSyn* ps = (PreSyn*) event;
            assert(ps >= nt.presyns && (ps < (nt.presyns + nt.n_presyn)));
            const auto presyn_offset = ps - nt.presyns;
            fh << presyn_offset << "\n";
            break;
        }
        case NetParEventType: {
            // nothing extra to write
            break;
        }
        case PlayRecordEventType: {
            PlayRecord* const player = ((PlayRecordEvent*) event)->plr_;
            const auto player_kind = player->type();
            fh << player_kind << "\n";
            if (player_kind != VecPlayContinuousType) {
                assert(0);
                break;
            }
            // payload: position of the instance in nt._vecplay
            // (if too many for a fast search, put ix in the instance)
            VecPlayContinuous* const vpc = (VecPlayContinuous*) player;
            int ix = -1;
            for (int k = 0; k < nt.n_vecplay && ix < 0; ++k) {
                if (nt._vecplay[k] == (void*) vpc) {
                    ix = k;
                }
            }
            assert(ix >= 0);
            fh << ix << "\n";
            break;
        }
        default: {
            // In particular, InputPreSyn does not appear in tqueue as it
            // immediately fans out to NetCon.
            assert(0);
            break;
        }
    }
}

void CheckPoints::restore_tqitem(int type,
                                 std::shared_ptr<Phase2::EventTypeBase> event,
                                 NrnThread& nt) {
    // printf("restore tqitem type=%d time=%.20g\n", type, time);

    switch (type) {
        case NetConType: {
            auto e = static_cast<Phase2::NetConType_*>(event.get());
            // printf("  NetCon %d\n", netcon_index);
            NetCon* nc = nt.netcons + e->netcon_index;
            nc->send(e->time, net_cvode_instance, &nt);
            break;
        }
        case SelfEventType: {
            auto e = static_cast<Phase2::SelfEventType_*>(event.get());
            if (e->target_type == patstimtype) {
                if (nt.id == 0) {
                    patstim_te = e->time;
                }
                break;
            }
            Point_process* pnt = nt.pntprocs + e->point_proc_instance;
            // printf("  SelfEvent %d %d %d %g %d %d\n", target_type, point_proc_instance,
            // target_instance, flag, movable, weight_index);
            nrn_assert(e->target_instance == pnt->_i_instance);
            nrn_assert(e->target_type == pnt->_type);
            net_send(nt._vdata + e->movable, e->weight_index, pnt, e->time, e->flag);
            break;
        }
        case PreSynType: {
            auto e = static_cast<Phase2::PreSynType_*>(event.get());
            // printf("  PreSyn %d\n", presyn_index);
            PreSyn* ps = nt.presyns + e->presyn_index;
            int gid = ps->output_index_;
            ps->output_index_ = -1;
            ps->send(e->time, net_cvode_instance, &nt);
            ps->output_index_ = gid;
            break;
        }
        case NetParEventType: {
            // nothing extra to read
            // printf("  NetParEvent\n");
            break;
        }
        case PlayRecordEventType: {
            auto e = static_cast<Phase2::PlayRecordEventType_*>(event.get());
            VecPlayContinuous* vpc = (VecPlayContinuous*) (nt._vecplay[e->vecplay_index]);
            vpc->e_->send(e->time, net_cvode_instance, &nt);
            break;
        }
        default: {
            assert(0);
            break;
        }
    }
}

/// Write the event-queue related state of thread `nt` to `fh`.
///
/// Sections, in order: VecPlayContinuous indices, the PatternStim index
/// (-1 when thread 0 has no PatternStim), the presyn helper flags, the items
/// taken from the queue with atomic_dq and the items found in binq_. Both
/// item lists are terminated by a line holding 0.
void CheckPoints::write_tqueue(NrnThread& nt, FileHandler& fh) const {
    // VecPlayContinuous
    fh << nt.n_vecplay << " VecPlayContinuous state\n";
    for (int k = 0; k < nt.n_vecplay; ++k) {
        VecPlayContinuous* const player = (VecPlayContinuous*) nt._vecplay[k];
        fh << player->last_index_ << "\n";
        fh << player->discon_index_ << "\n";
        fh << player->ubound_index_ << "\n";
    }

    // PatternStim: always looked up on thread 0
    int saved_patstim_index = -1;
    auto* entry = nrn_threads[0].tml;
    while (entry != nullptr && entry->index != patstimtype) {
        entry = entry->next;
    }
    if (entry != nullptr) {
        Memb_list* const patstim_ml = entry->ml;
        saved_patstim_index = checkpoint_save_patternstim(
            /* below correct only for AoS */
            0,
            patstim_ml->nodecount,
            patstim_ml->data,
            patstim_ml->pdata,
            patstim_ml->_thread,
            nrn_threads,
            patstim_ml,
            0.0);
    }
    fh << saved_patstim_index << " PatternStim\n";

    // Avoid extra spikes due to some presyn voltages above threshold
    fh << -1 << " Presyn ConditionEvent flags\n";
    for (int k = 0; k < nt.n_presyn; ++k) {
        // PreSyn.flag_ not used. HPC memory utilizes PreSynHelper.flag_ array
        fh << nt.presyns_helper[k].flag_ << "\n";
    }

    NetCvodeThreadData& thread_data = net_cvode_instance->p[nt.id];
    TQueue<QTYPE>* const queue = thread_data.tqe_;

    fh << -1 << " TQItems from atomic_dq\n";
    // Note: atomic_dq removes each item from the queue as it is written.
    for (TQItem* item = queue->atomic_dq(1e20); item != nullptr; item = queue->atomic_dq(1e20)) {
        write_tqueue(item, nt, fh);
    }
    fh << 0 << "\n";

    fh << -1 << " TQItemsfrom binq_\n";
    for (TQItem* item = queue->binq_->first(); item; item = queue->binq_->next(item)) {
        write_tqueue(item, nt, fh);
    }
    fh << 0 << "\n";
}

// Restore the tqueue part of a checkpoint.
// Layout of that part of the file:
// int: should be equal to the previous n_vecplay
// for each of the n_vecplay instances:
//   int: last_index
//   int: discon_index
//   int: ubound_index
// int: patstim_index
// int: should be -1
// for each of the n_presyn presyns:
//   int: flag of presyn_helper
// int: should be -1
// list of items terminated by 0:
//   int: type
//   array of size 1:
//     double: time
//   ... depends on the type
// int: should be -1
// list of items terminated by 0:
//   int: TO BE DEFINED
//   ... depends on the type
//
// The values themselves are taken from the already parsed `p2`.
void CheckPoints::restore_tqueue(NrnThread& nt, const Phase2& p2) {
    restored = true;

    // VecPlayContinuous indices
    for (int k = 0; k < nt.n_vecplay; ++k) {
        const auto& saved = p2.vec_play_continuous[k];
        VecPlayContinuous* const player = (VecPlayContinuous*) nt._vecplay[k];
        player->last_index_ = saved.last_index;
        player->discon_index_ = saved.discon_index;
        player->ubound_index_ = saved.ubound_index;
    }

    // PatternStim
    patstim_index = p2.patstim_index;
    if (nt.id == 0) {
        patstim_te = -1.0;  // changed if relevant SelfEvent;
    }

    // Presyn ConditionEvent flags
    for (int k = 0; k < nt.n_presyn; ++k) {
        nt.presyns_helper[k].flag_ = p2.preSynConditionEventFlags[k];
    }

    // Saved queue events, in stored order
    for (auto it = p2.events.begin(); it != p2.events.end(); ++it) {
        restore_tqitem(it->first, it->second, nt);
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn_checkpoint.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/io/phase2.hpp"

namespace coreneuron {
struct NrnThread;
class FileHandler;

/** Checkpoint save/restore driver.
 *
 * Holds the directory to save to and the directory to restore from (either may
 * be empty, meaning "do not save" / "do not restore"), plus the small amount
 * of state that has to survive between restoring the event queue and the
 * later call to initialize().
 */
class CheckPoints {
  public:
    CheckPoints(const std::string& save, const std::string& restore);

    /// Directory checkpoints are written to (empty if saving is disabled).
    std::string get_save_path() const {
        return save_;
    }
    /// Directory checkpoints are read from (empty if restoring is disabled).
    std::string get_restore_path() const {
        return restore_;
    }
    /// True when a save path was given.
    bool should_save() const {
        return !save_.empty();
    }
    /// True when a restore path was given.
    bool should_restore() const {
        return !restore_.empty();
    }
    double restore_time() const;
    void write_checkpoint(NrnThread* nt, int nb_threads) const;
    /* return true if special checkpoint initialization carried out and
       one should not do finitialize
     */
    bool initialize();
    /// Restore the event-queue state of one thread from parsed phase 2 data.
    void restore_tqueue(NrnThread&, const Phase2& p2);

  private:
    const std::string save_;
    const std::string restore_;

    // The three members below are read by initialize() but are only assigned
    // while restoring (restore_tqueue / restore_tqitem). Default member
    // initializers guarantee well-defined values when nothing was restored;
    // they are ignored for any member the constructor initializes itself.
    bool restored = false;       ///< set once restore_tqueue() has run
    int patstim_index = -1;      ///< saved PatternStim index, -1 if none
    double patstim_te = -1.0;    ///< time of the saved PatternStim SelfEvent, -1 if none

    void write_time() const;
    void write_phase2(NrnThread& nt) const;

    template <typename T>
    void data_write(FileHandler& F, T* data, int cnt, int sz, int layout, int* permute) const;
    template <typename T>
    T* soa2aos(T* data, int cnt, int sz, int layout, int* permute) const;
    void write_tqueue(TQItem* q, NrnThread& nt, FileHandler& fh) const;
    void write_tqueue(NrnThread& nt, FileHandler& fh) const;
    void restore_tqitem(int type, std::shared_ptr<Phase2::EventTypeBase> event, NrnThread& nt);
};


// Declaration only; the definition is not in this header.
// NOTE(review): presumably returns the inverse of the permutation p of
// length n — confirm ownership of the returned array at the definition.
int* inverse_permute(int* p, int n);
// Declaration only. Takes a flat data index i together with the instance
// count cnt, the per-instance size sz and the layout, and reports an instance
// index (icnt) and a per-instance offset (isz) through the reference
// parameters.
// NOTE(review): presumably the inverse of nrn_i_layout — confirm.
void nrn_inverse_i_layout(int i, int& icnt, int cnt, int& isz, int sz, int layout);

// Mechanism type used to recognise PatternStim (defined in nrn_checkpoint.cpp).
extern int patstimtype;

#ifndef CHKPNTDEBUG
#define CHKPNTDEBUG 0
#endif

#if CHKPNTDEBUG
// Factored out from checkpoint changes to nrnoc/multicore.h and nrnoc/nrnoc_ml.h
// Put here to avoid potential issues with gpu transfer and to allow
// debugging comparison with respect to checkpoint writing to verify that
// data is same as on reading when inverse transforming SoA and permutations.
// Following is a mixture of substantive information which is lost during
// nrn_setup.cpp and debugging only information which is retrievable from
// NrnThread and Memb_list. Ideally, this should all go away

// Debug-only companion data for a Memb_list; exists only when CHKPNTDEBUG is
// enabled.
// NOTE(review): by their names the members hold the data, pdata and
// nodeindices arrays as they were before permutation — confirm where they are
// filled in.
struct Memb_list_chkpnt {
    // debug only
    double* data_not_permuted;
    Datum* pdata_not_permuted;
    int* nodeindices_not_permuted;
};

#endif  // CHKPNTDEBUG but another section for it below

// Per-thread checkpoint bookkeeping. Only file_id exists in normal builds;
// everything else is compiled in when CHKPNTDEBUG is enabled.
// NOTE(review): the debug members presumably hold the values seen while
// reading the input files, so that checkpoint writing can assert it
// reproduces them — confirm against the reader.
struct NrnThreadChkpnt {
    int file_id;

#if CHKPNTDEBUG
    int nmech;
    double* area;
    int* parent;
    Memb_list_chkpnt** mlmap;

    // output gids
    int n_outputgids;
    int* output_vindex;
    double* output_threshold;

    // per-NetCon arrays
    int* pnttype;
    int* pntindex;
    double* delay;

    // BBCOREPOINTER
    int nbcp;      // number of entries in the three arrays below
    int* bcptype;  // mechanism type
    int* bcpicnt;  // int count
    int* bcpdcnt;  // double count

    // VecPlay
    int* vtype;
    int* mtype;
    int* vecplay_ix;
#endif  // CHKPNTDEBUG
};

extern NrnThreadChkpnt* nrnthread_chkpnt;
}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn_filehandler.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <iostream>
#include "coreneuron/io/nrn_filehandler.hpp"
#include "coreneuron/nrnconf.h"

namespace coreneuron {
/// Construct a handler with both checkpoint counters at 0 and open `filename`
/// with the default mode of open().
FileHandler::FileHandler(const std::string& filename)
    : FileHandler() {
    open(filename);
}

/// Return true when stat() succeeds on `filename`.
bool FileHandler::file_exist(const std::string& filename) {
    struct stat info;
    const int rc = stat(filename.c_str(), &info);
    return rc == 0;
}

/// Close whatever is currently open and open `filename` in binary mode.
///
/// `mode` must contain std::ios::in and/or std::ios::out. When reading, the
/// first line of the file is consumed and passed to
/// check_bbcore_write_version(); when writing, bbcore_write_version is
/// emitted as the first line. Failure to open is reported on stderr and then
/// asserted.
void FileHandler::open(const std::string& filename, std::ios::openmode mode) {
    nrn_assert((mode & (std::ios::in | std::ios::out)));

    close();
    F.open(filename, mode | std::ios::binary);
    const bool opened = F.is_open();
    if (!opened) {
        std::cerr << "cannot open file '" << filename << "'" << std::endl;
    }
    nrn_assert(F.is_open());

    current_mode = mode;

    if (static_cast<bool>(current_mode & std::ios::in)) {
        char version[256];
        F.getline(version, sizeof(version));
        nrn_assert(!F.fail());
        check_bbcore_write_version(version);
    }
    if (static_cast<bool>(current_mode & std::ios::out)) {
        F << bbcore_write_version << "\n";
    }
}

/// Return true when there is nothing more to read.
///
/// If the stream is not already at end-of-file, one character is read; when
/// that read does not hit end-of-file the character is put back.
bool FileHandler::eof() {
    if (!F.eof()) {
        const int next = F.get();
        if (!F.eof()) {
            F.putback(next);
            return false;
        }
    }
    return true;
}

int FileHandler::read_int() {
    char line_buf[max_line_length];

    F.getline(line_buf, sizeof(line_buf));
    nrn_assert(!F.fail());

    int i;
    int n_scan = sscanf(line_buf, "%d", &i);
    nrn_assert(n_scan == 1);

    return i;
}

void FileHandler::read_mapping_count(int* gid, int* nsec, int* nseg, int* nseclist) {
    char line_buf[max_line_length];

    F.getline(line_buf, sizeof(line_buf));
    nrn_assert(!F.fail());

    /** mapping file has extra strings, ignore those */
    int n_scan = sscanf(line_buf, "%d %d %d %d", gid, nsec, nseg, nseclist);
    nrn_assert(n_scan == 4);
}

/// Store the integer returned by read_int() in *count.
void FileHandler::read_mapping_cell_count(int* count) {
    const int n_cells = read_int();
    *count = n_cells;
}

void FileHandler::read_checkpoint_assert() {
    char line_buf[max_line_length];

    F.getline(line_buf, sizeof(line_buf));
    nrn_assert(!F.fail());

    int i;
    int n_scan = sscanf(line_buf, "chkpnt %d\n", &i);
    if (n_scan != 1) {
        fprintf(stderr, "no chkpnt line for %d\n", chkpnt);
    }
    nrn_assert(n_scan == 1);
    if (i != chkpnt) {
        fprintf(stderr, "file chkpnt %d != expected %d\n", i, chkpnt);
    }
    nrn_assert(i == chkpnt);
    ++chkpnt;
}

// Close the underlying std::fstream. open() calls this before it opens a new
// file, so it is also invoked when no file is currently open.
void FileHandler::close() {
    F.close();
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn_filehandler.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <iostream>
#include <fstream>
#include <vector>
#include <sys/stat.h>

#include "coreneuron/utils/nrn_assert.h"

namespace coreneuron {
/** Encapsulate low-level reading of coreneuron input data files.
 *
 * Error handling is simple: abort()!
 *
 * Reader will abort() if native integer size is not 4 bytes.
 *
 * All automatic allocations performed by read_int_array()
 * and read_dbl_array() methods use new [].
 */

// @todo: remove this static buffer
const int max_line_length = 1024;

class FileHandler {
    std::fstream F;                        //!< File stream associated with reader.
    std::ios_base::openmode current_mode;  //!< File open mode (not stored in fstream)
    int chkpnt;                            //!< Current checkpoint number state.
    int stored_chkpnt;                     //!< last "remembered" checkpoint number state.
    /** Read a checkpoint line, bump our chkpnt counter, and assert equality.
     *
     * Checkpoint information is represented by a sequence "chkpnt %d\n"
     * where %d is a scanf-compatible representation of the checkpoint
     * integer.
     */
    void read_checkpoint_assert();

    // FileHandler is not copyable.
    FileHandler(const FileHandler&) = delete;
    FileHandler& operator=(const FileHandler&) = delete;

  public:
    FileHandler()
        : chkpnt(0)
        , stored_chkpnt(0) {}

    explicit FileHandler(const std::string& filename);

    /** Preserving chkpnt state, move to a new file. */
    void open(const std::string& filename, std::ios::openmode mode = std::ios::in);

    /** Is the underlying stream in a failed state (std::fstream::fail()). */
    bool fail() const {
        return F.fail();
    }

    static bool file_exist(const std::string& filename);

    /** nothing more to read */
    bool eof();

    /** Query chkpnt state. */
    int checkpoint() const {
        return chkpnt;
    }

    /** Explicitly override chkpnt state. */
    void checkpoint(int c) {
        chkpnt = c;
    }

    /** Record current chkpnt state. */
    void record_checkpoint() {
        stored_chkpnt = chkpnt;
    }

    /** Restore last recorded chkpnt state. */
    void restore_checkpoint() {
        chkpnt = stored_chkpnt;
    }

    /** Parse a single integer entry.
     *
     * Single integer entries are represented by their standard
     * (C locale) text representation, followed by a newline.
     * Extraneous characters following the integer but preceding
     * the newline are ignored.
     */
    int read_int();

    /** Parse a neuron mapping count entries
     *
     * Reads neuron mapping info which is represented by
     * gid, #sections, #segments, #section lists
     */
    void read_mapping_count(int* gid, int* nsec, int* nseg, int* nseclist);

    /** Reads number of cells in parsing file */
    void read_mapping_cell_count(int* count);

    /** Parse a neuron section segment mapping
     *
     * Reads a header line "<name> <nsec> <nseg>" and, when nseg is non-zero,
     * two integer arrays of nseg entries each (sections, then segments).
     * Every (section, segment) pair is passed to mapinfo->add_segment().
     *
     * \tparam T mapping type providing a \c name member and \c add_segment().
     * \param mapinfo mapping object to fill in.
     * \return the number of segments read (nseg).
     */
    template <typename T>
    int read_mapping_info(T* mapinfo) {
        int nsec, nseg, n_scan;
        char line_buf[max_line_length], name[max_line_length];

        F.getline(line_buf, sizeof(line_buf));
        nrn_assert(!F.fail());
        // name cannot overflow: it is as large as the line it is parsed from
        n_scan = sscanf(line_buf, "%s %d %d", name, &nsec, &nseg);

        nrn_assert(n_scan == 3);
        nrn_assert(nseg >= 0);

        mapinfo->name = std::string(name);

        if (nseg) {
            // The vectors must actually contain nseg elements: reserve() only
            // allocates capacity, and reading into / indexing an empty vector
            // is undefined behaviour.
            std::vector<int> sec(nseg), seg(nseg);

            read_array<int>(sec.data(), nseg);
            read_array<int>(seg.data(), nseg);

            for (int i = 0; i < nseg; i++) {
                mapinfo->add_segment(sec[i], seg[i]);
            }
        }
        return nseg;
    }

    /** Defined flag values for parse_array() */
    enum parse_action { read, seek };

    /** Generic parse function for an array of fixed length.
     *
     * \tparam T the array element type: may be \c int or \c double.
     * \param p pointer to the target in memory for reading into.
     * \param count number of items of type \a T to parse.
     * \param flag whether to validate and skip (\c seek) or
     *    copy array into memory (\c read).
     * \return the supplied pointer value.
     *
     * Error if \a count is non-zero, \a flag is \c read, and
     * the supplied pointer \a p is null.
     *
     * Arrays are represented by a checkpoint line followed by
     * the array items in increasing index order, in the native binary
     * representation of the writing process.
     */
    template <typename T>
    inline T* parse_array(T* p, size_t count, parse_action flag) {
        if (count > 0 && flag != seek)
            nrn_assert(p != 0);

        read_checkpoint_assert();
        switch (flag) {
            case seek:
                F.seekg(count * sizeof(T), std::ios_base::cur);
                break;
            case read:
                F.read((char*) p, count * sizeof(T));
                break;
        }

        nrn_assert(!F.fail());
        return p;
    }

    // convenience interfaces:

    /** Read an array of fixed length into caller-provided storage. */
    template <typename T>
    inline T* read_array(T* p, size_t count) {
        return parse_array(p, count, read);
    }

    /** Allocate (with new[]) and read an array of fixed length.
     *  The caller owns the returned array. */
    template <typename T>
    inline T* read_array(size_t count) {
        return parse_array(new T[count], count, read);
    }

    /** Read an array of fixed length into a std::vector. */
    template <typename T>
    inline std::vector<T> read_vector(size_t count) {
        std::vector<T> vec(count);
        parse_array(vec.data(), count, read);
        return vec;
    }

    /** Close currently open file. */
    void close();

    /** Write an 1D array: a checkpoint line followed by the raw bytes. **/
    template <typename T>
    void write_array(T* p, size_t nb_elements) {
        nrn_assert(F.is_open());
        nrn_assert(current_mode & std::ios::out);
        write_checkpoint();
        F.write((const char*) p, nb_elements * (sizeof(T)));
        nrn_assert(!F.fail());
    }

    /** Write a padded array. nb_elements is number of elements to write per line,
     * line_width is full size of a line in nb elements.
     * With to_transpose the nb_lines x nb_elements block is written transposed;
     * otherwise the first nb_elements * nb_lines items of p are written as is. **/
    template <typename T>
    void write_array(T* p,
                     size_t nb_elements,
                     size_t line_width,
                     size_t nb_lines,
                     bool to_transpose = false) {
        nrn_assert(F.is_open());
        nrn_assert(current_mode & std::ios::out);
        write_checkpoint();
        T* temp_cpy = new T[nb_elements * nb_lines];

        if (to_transpose) {
            for (size_t i = 0; i < nb_lines; i++) {
                for (size_t j = 0; j < nb_elements; j++) {
                    temp_cpy[i + j * nb_lines] = p[i * line_width + j];
                }
            }
        } else {
            memcpy(temp_cpy, p, nb_elements * sizeof(T) * nb_lines);
        }
        // AoS never use padding, SoA is translated above, so one write
        // operation is enough in both cases
        F.write((const char*) temp_cpy, nb_elements * sizeof(T) * nb_lines);
        nrn_assert(!F.fail());
        delete[] temp_cpy;
    }

    /** Stream a scalar (or string) in text form to the file opened for writing. */
    template <typename T>
    FileHandler& operator<<(const T& scalar) {
        nrn_assert(F.is_open());
        nrn_assert(current_mode & std::ios::out);
        F << scalar;
        nrn_assert(!F.fail());
        return *this;
    }

  private:
    /* write_checkpoint is callable only for our internal uses; making it accessible to the user
     * would make the file format unpredictable */
    void write_checkpoint() {
        F << "chkpnt " << chkpnt++ << "\n";
    }
};
}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn_setup.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <algorithm>
#include <vector>
#include <map>
#include <cstring>
#include <mutex>

#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/nrnconf.h"
#include "coreneuron/utils/randoms/nrnran123.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/network/multisend.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/utils/nrnmutdec.hpp"
#include "coreneuron/utils/memory.h"
#include "coreneuron/utils/utils.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/io/nrn_setup.hpp"
#include "coreneuron/network/partrans.hpp"
#include "coreneuron/io/nrn_checkpoint.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/io/nrnsection_mapping.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/io/phase1.hpp"
#include "coreneuron/io/phase2.hpp"
#include "coreneuron/io/mech_report.h"
#include "coreneuron/io/reports/nrnreport.hpp"

// callbacks into nrn/src/nrniv/nrnbbcore_write.cpp
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/coreneuron.hpp"


/// --> Coreneuron
// Global flags and callback pointers. None of them is assigned in this part
// of the file.
// NOTE(review): presumably they are filled in by NEURON when CoreNEURON runs
// embedded (see "callbacks into nrn/src/nrniv/nrnbbcore_write.cpp" above) —
// confirm where each one is set.
bool corenrn_embedded;
int corenrn_embedded_nthread;

// Callback taking an int array (group ids, judging by the name).
void (*nrn2core_group_ids_)(int*);

extern "C" {
// Callback returning SetupTransferInfo for the given group/thread counts.
SetupTransferInfo* (*nrn2core_get_partrans_setup_info_)(int ngroup,
                                                        int cn_nthread,
                                                        size_t cn_sidt_size);
}

// Callback for thread tid; every parameter after tid is passed by reference.
void (*nrn2core_get_trajectory_requests_)(int tid,
                                          int& bsize,
                                          int& n_pr,
                                          void**& vpr,
                                          int& n_trajec,
                                          int*& types,
                                          int*& indices,
                                          double**& pvars,
                                          double**& varrays);

// Callback taking thread id, n_pr, vpr and a time t.
void (*nrn2core_trajectory_values_)(int tid, int n_pr, void** vpr, double t);

// Callback taking thread id, n_pr, bsize, vecsz, vpr and a time t.
void (*nrn2core_trajectory_return_)(int tid, int n_pr, int bsize, int vecsz, void** vpr, double t);

// Callback receiving references to the spike time and gid vectors.
int (*nrn2core_all_spike_vectors_return_)(std::vector<double>& spikevec, std::vector<int>& gidvec);

// Callback receiving a reference to a vector of weight pointers.
void (*nrn2core_all_weights_return_)(std::vector<double*>& weights);

// file format defined in cooperation with nrncore/src/nrniv/nrnbbcore_write.cpp
// single integers are ascii one per line. arrays are binary int or double
// Note that regardless of the gid contents of a group, since all gids are
// globally unique, a filename convention which involves the first gid
// from the group is adequate. Also note that balance is carried out from a
// per group perspective and launching a process consists of specifying
// a list of group ids (first gid of the group) for each process.
//
// <firstgid>_1.dat
// n_presyn, n_netcon
// output_gids (npresyn) with -(type+1000*index) for those acell with no gid
// netcon_srcgid (nnetcon) -(type+1000*index) refers to acell with no gid
//                         -1 means the netcon has no source (not implemented)
// Note that the negative gids are only thread unique and not process unique.
// We create a thread specific hash table for the negative gids for each thread
// when <firstgid>_1.dat is read and then destroy it after <firstgid>_2.dat
// is finished using it.  An earlier implementation which attempted to
// encode the thread number into the negative gid
// (i.e -ith - nth*(type +1000*index)) failed due to not large enough
// integer domain size.
// Note that for file transfer it is an error if a negative srcgid is
// not in the same thread as the target. This is because it may
// not be the case that threads in a NEURON process end up on the same process
// in CoreNEURON. NEURON will raise an error if this
// is the case. However, for direct memory transfer, it is allowed that
// a negative srcgid may be in a different thread than the target. So
// nrn2core_get_dat1 has a last arg netcon_negsrcgid_tid that specifies
// for the negative gids in netcon_srcgid (in that order) the source thread.
//
// <firstgid>_2.dat
// n_real_cell, n_output, n_real_output, nnode
// ndiam - 0 if no mechanism has dparam with diam semantics, or nnode
// nmech - includes artcell mechanisms
// for the nmech tml mechanisms
//   type, nodecount
// nidata, nvdata, nweight
// v_parent_index (nnode)
// actual_a, b, area, v (nnode)
// diam - if ndiam > 0. Note that only valid diam is for those nodes with diam semantics mechanisms
// for the nmech tml mechanisms
//   nodeindices (nodecount) but only if not an artificial cell
//   data (nodecount*param_size)
//   pdata (nodecount*dparam_size) but only if dparam_size > 0 on this side.
// output_vindex (n_presyn) >= 0 associated with voltages -(type+1000*index) for acell
// output_threshold (n_real_output)
// netcon_pnttype (nnetcon) <=0 if a NetCon does not have a target.
// netcon_pntindex (nnetcon)
// weights (nweight)
// delays (nnetcon)
// for the nmech tml mechanisms that have a nrn_bbcore_write method
//   type
//   icnt
//   dcnt
//   int array (number specified by the nodecount nrn_bbcore_write
//     to be interpreted by this side's nrn_bbcore_read method)
//   double array
// #VectorPlay_instances, for each of these instances
// 4 (VecPlayContinuousType)
// mtype
// index (from Memb_list.data)
// vecsize
// yvec
// tvec
//
// The critical issue requiring careful attention is that a coreneuron
// process reads many coreneuron thread files with a result that, although
// the conceptual
// total n_pre is the sum of all the n_presyn from each thread as is the
// total number of output_gid, the number of InputPreSyn instances must
// be computed here from a knowledge of all thread's netcon_srcgid after
// all thread's output_gids have been registered. We want to save the
// "individual allocation of many small objects" memory overhead by
// allocating a single InputPreSyn array for the entire process.
// For this reason cellgroup data are divided into two separate
// files with the first containing output_gids and netcon_srcgid which are
// stored in the nt.presyns array and nt.netcons array respectively
namespace coreneuron {
/// Mutex shared by the setup code in this file: passed to Phase1::populate()
/// and held around the mechanisms' thread_mem_init_ calls in setup_ThreadData().
static OMP_Mutex mut;

/// Vector of maps for negative presyns, indexed by thread id
std::vector<std::map<int, PreSyn*>> neg_gid2out;
/// Maps for output and input presyns, keyed by (non-negative) gid
std::map<int, PreSyn*> gid2out;
std::map<int, InputPreSyn*> gid2in;

/// InputPreSyn.nc_index_ to + InputPreSyn.nc_cnt_ give the NetCon*
std::vector<NetCon*> netcon_in_presyn_order_;

/// Only for setup: per-thread arrays of netcon source gids
/// (each entry is delete[]d by nrn_setup_cleanup())
std::vector<int*> nrnthreads_netcon_srcgid;

/// If a nrnthreads_netcon_srcgid entry is negative, the source thread has to be
/// determined in order to use the correct neg_gid2out[tid] map
std::vector<std::vector<int>> nrnthreads_netcon_negsrcgid_tid;

/**
 * Read the files.dat file and distribute the cell groups over all MPI ranks.
 *
 * In embedded mode (corenrn_embedded) no file is read: the group count is
 * corenrn_embedded_nthread and the ids are obtained through nrn2core_group_ids_.
 *
 * Otherwise the file is expected to contain a version string, the number of
 * groups (optionally preceded by a line containing -1, which marks a model
 * with gap junctions) and then one group id per entry. Entry number iNum is
 * assigned to the rank for which iNum % nrnmpi_numprocs == nrnmpi_myid.
 *
 * @param[out] ngrp     number of groups assigned to this rank
 * @param[out] grp      newly allocated (new[]) array holding the assigned group
 *                      ids; the caller owns it
 * @param[in]  filesdat path of the files.dat file (unused in embedded mode)
 */
void nrn_read_filesdat(int& ngrp, int*& grp, const char* filesdat) {
    patstimtype = nrn_get_mechtype("PatternStim");
    if (corenrn_embedded) {
        ngrp = corenrn_embedded_nthread;
        grp = new int[ngrp + 1];
        (*nrn2core_group_ids_)(grp);
        return;
    }

    FILE* fp = fopen(filesdat, "r");

    if (!fp) {
        nrn_fatal_error("No input file ( %s ) with nrnthreads, exiting...", filesdat);
    }

    char version[256];
    // The field width bounds the conversion to the buffer (255 characters plus
    // the terminating NUL); an unbounded %s would overflow `version` on a
    // malformed or hostile file.
    nrn_assert(fscanf(fp, "%255s\n", version) == 1);
    check_bbcore_write_version(version);

    int iNumFiles = 0;
    nrn_assert(fscanf(fp, "%d\n", &iNumFiles) == 1);

    // temporary stratagem to figure out if model uses gap junctions while
    // being backward compatible
    if (iNumFiles == -1) {
        nrn_assert(fscanf(fp, "%d\n", &iNumFiles) == 1);
        nrn_have_gaps = true;
        if (nrnmpi_myid == 0) {
            printf("Model uses gap junctions\n");
        }
    }

    if (nrnmpi_numprocs > iNumFiles && nrnmpi_myid == 0) {
        printf(
            "Info : The number of input datasets are less than ranks, some ranks will be idle!\n");
    }

    ngrp = 0;
    // A rank receives at most ceil(iNumFiles / nrnmpi_numprocs) entries.
    grp = new int[iNumFiles / nrnmpi_numprocs + 1];

    // iterate over gids in files.dat
    for (int iNum = 0; iNum < iNumFiles; ++iNum) {
        int iFile;

        nrn_assert(fscanf(fp, "%d\n", &iFile) == 1);
        if ((iNum % nrnmpi_numprocs) == nrnmpi_myid) {
            grp[ngrp] = iFile;
            ngrp++;
        }
    }

    fclose(fp);
}

void netpar_tid_gid2ps(int tid, int gid, PreSyn** ps, InputPreSyn** psi) {
    /// for gid < 0 returns the PreSyn* in the thread (tid) specific map.
    *ps = nullptr;
    *psi = nullptr;

    if (gid >= 0) {
        auto gid2out_it = gid2out.find(gid);
        if (gid2out_it != gid2out.end()) {
            *ps = gid2out_it->second;
        } else {
            auto gid2in_it = gid2in.find(gid);
            if (gid2in_it != gid2in.end()) {
                *psi = gid2in_it->second;
            }
        }
    } else {
        auto gid2out_it = neg_gid2out[tid].find(gid);
        if (gid2out_it != neg_gid2out[tid].end()) {
            *ps = gid2out_it->second;
        }
    }
}

/**
 * Create the InputPreSyn objects needed by this process and fill
 * netcon_in_presyn_order_.
 *
 * Works in three passes over the per-thread netcon source gids
 * (nrnthreads_netcon_srcgid):
 *  1. count, per PreSyn / InputPreSyn, the NetCons it drives (nc_cnt_),
 *     creating an InputPreSyn for every non-negative gid that is in neither
 *     gid2out nor gid2in;
 *  2. turn the counts into offsets (nc_index_) and reset nc_cnt_ to 0;
 *  3. store each NetCon* at nc_index_ + nc_cnt_ of its source and recount.
 *
 * Reads gid2out / neg_gid2out, so all output gids must already be registered.
 */
void determine_inputpresyn() {
    // allocate the process wide InputPreSyn array
    // all the output_gid have been registered and associated with PreSyn.
    // now count the needed InputPreSyn by filling the netpar::gid2in map
    gid2in.clear();

    // now have to fill the new table
    // do not need to worry about negative gid overlap since only use
    // it to search for PreSyn in this thread.

    // InputPreSyn in creation order; this order defines their nc_index_ below.
    std::vector<InputPreSyn*> inputpresyn_;

    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        // associate gid with InputPreSyn and increase PreSyn and InputPreSyn count
        nt.n_input_presyn = 0;
        // if single thread or file transfer then definitely empty.
        std::vector<int>& negsrcgid_tid = nrnthreads_netcon_negsrcgid_tid[ith];
        size_t i_tid = 0;
        for (int i = 0; i < nt.n_netcon; ++i) {
            int gid = nrnthreads_netcon_srcgid[ith][i];
            if (gid >= 0) {
                /// If PreSyn or InputPreSyn is already in the map
                auto gid2out_it = gid2out.find(gid);
                if (gid2out_it != gid2out.end()) {
                    /// Increase PreSyn count
                    ++gid2out_it->second->nc_cnt_;
                    continue;
                }
                auto gid2in_it = gid2in.find(gid);
                if (gid2in_it != gid2in.end()) {
                    /// Increase InputPreSyn count
                    ++gid2in_it->second->nc_cnt_;
                    continue;
                }

                /// Create InputPreSyn and increase its count
                InputPreSyn* psi = new InputPreSyn;
                ++psi->nc_cnt_;
                gid2in[gid] = psi;
                inputpresyn_.push_back(psi);
                ++nt.n_input_presyn;
            } else {
                // NOTE(review): this branch consumes an entry of negsrcgid_tid
                // for every negative gid, including -1, whereas the fill loop
                // further down only consumes one when gid < -1. Confirm whether
                // negsrcgid_tid holds an entry for gid == -1; if it does not,
                // the two loops index it differently.
                int tid = nt.id;
                if (!negsrcgid_tid.empty()) {
                    tid = negsrcgid_tid[i_tid++];
                }
                auto gid2out_it = neg_gid2out[tid].find(gid);
                if (gid2out_it != neg_gid2out[tid].end()) {
                    /// Increase negative PreSyn count
                    ++gid2out_it->second->nc_cnt_;
                }
            }
        }
    }

    // now, we can opportunistically create the NetCon* pointer array
    // to save some memory overhead for
    // "large number of small array allocation" by
    // counting the number of NetCons each PreSyn and InputPreSyn point to.
    // Conceivably the nt.netcons could become a process global array
    // in which case the NetCon* pointer array could become an integer index
    // array. More speculatively, the index array could be eliminated itself
    // if the process global NetCon array were ordered properly but that
    // would interleave NetCon from different threads. Not a problem for
    // serial threads but the reordering would propagate to nt.pntprocs
    // if the NetCon data pointers are also replaced by integer indices.

    // First, allocate the pointer array.
    int n_nc = 0;
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        n_nc += nrn_threads[ith].n_netcon;
    }
    netcon_in_presyn_order_.resize(n_nc);
    // n_nc is reused below to count the NetCons actually stored.
    n_nc = 0;

    // fill the indices with the offset values and reset the nc_cnt_
    // such that we use the nc_cnt_ in the following loop to assign the NetCon
    // to the right place
    // for PreSyn
    int offset = 0;
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        for (int i = 0; i < nt.n_presyn; ++i) {
            PreSyn& ps = nt.presyns[i];
            ps.nc_index_ = offset;
            offset += ps.nc_cnt_;
            ps.nc_cnt_ = 0;
        }
    }
    // for InputPreSyn (their ranges follow those of all PreSyn)
    for (auto psi: inputpresyn_) {
        psi->nc_index_ = offset;
        offset += psi->nc_cnt_;
        psi->nc_cnt_ = 0;
    }

    // The InputPreSyn objects stay reachable through gid2in.
    inputpresyn_.clear();

    // with gid to InputPreSyn and PreSyn maps we can setup the multisend
    // target lists.
    if (use_multisend_) {
#if NRN_MULTISEND
        nrn_multisend_setup();
#endif
    }

    // fill the netcon_in_presyn_order and recompute nc_cnt_
    // note that not all netcon_in_presyn will be filled if there are netcon
    // with no presyn (ie. nrnthreads_netcon_srcgid[nt.id][i] = -1) but that is ok since they are
    // only used via ps.nc_index_ and ps.nc_cnt_;
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        // if single thread or file transfer then definitely empty.
        std::vector<int>& negsrcgid_tid = nrnthreads_netcon_negsrcgid_tid[ith];
        size_t i_tid = 0;
        for (int i = 0; i < nt.n_netcon; ++i) {
            NetCon* nc = nt.netcons + i;
            int gid = nrnthreads_netcon_srcgid[ith][i];
            int tid = ith;
            if (!negsrcgid_tid.empty() && gid < -1) {
                tid = negsrcgid_tid[i_tid++];
            }
            PreSyn* ps;
            InputPreSyn* psi;
            // sets both pointers; at most one is non-null afterwards
            netpar_tid_gid2ps(tid, gid, &ps, &psi);
            if (ps) {
                netcon_in_presyn_order_[ps->nc_index_ + ps->nc_cnt_] = nc;
                ++ps->nc_cnt_;
                ++n_nc;
            } else if (psi) {
                netcon_in_presyn_order_[psi->nc_index_ + psi->nc_cnt_] = nc;
                ++psi->nc_cnt_;
                ++n_nc;
            }
        }
    }

    /// Resize the vector to its actual size of the netcons put in it
    netcon_in_presyn_order_.resize(n_nc);
}

/// Clean up the setup-only tables: the per-thread netcon source gid arrays,
/// the negative-gid source thread table and the negative-gid PreSyn maps.
/// Safe to call more than once, or before nrn_setup() has sized the tables.
void nrn_setup_cleanup() {
    // Walk the container itself rather than 0..nrn_nthread: the vector is empty
    // after a previous cleanup (or before setup), so indexing it by
    // nrn_nthread would read out of bounds.
    for (int*& srcgid: nrnthreads_netcon_srcgid) {
        delete[] srcgid;  // delete[] of a null pointer is a no-op
        srcgid = nullptr;
    }
    nrnthreads_netcon_srcgid.clear();
    nrnthreads_netcon_negsrcgid_tid.clear();
    neg_gid2out.clear();
}

/**
 * Build the in-memory model for this process.
 *
 * Steps, in order: determine the cell groups of this rank (nrn_read_filesdat),
 * create the NrnThreads, run phase 1 (output gids / netcon source gids),
 * determine_inputpresyn(), phase 2 (rest of the per-group data), optional gap
 * junction setup, optional phase 3 (mapping), minimum delay, optional
 * nrn_setup_cleanup(), then fast_imem allocation, table checks and the model
 * size report.
 *
 * @param filesdat          path handed to nrn_read_filesdat()
 * @param is_mapping_needed when true, phase 3 (mapping information) is read
 * @param checkPoints       forwarded to UserParams
 * @param run_setup_cleanup when true nrn_setup_cleanup() is called here,
 *                          otherwise the caller must call it later
 * @param datpath           data path forwarded to UserParams; also used as
 *                          restore path when restore_path is empty
 * @param restore_path      restore path; must not be null (strlen is applied)
 * @param mindelay          in/out: replaced by set_mindelay(*mindelay)
 */
void nrn_setup(const char* filesdat,
               bool is_mapping_needed,
               CheckPoints& checkPoints,
               bool run_setup_cleanup,
               const char* datpath,
               const char* restore_path,
               double* mindelay) {
    double time = nrn_wtime();

    int ngroup;
    int* gidgroups;
    nrn_read_filesdat(ngroup, gidgroups, filesdat);
    UserParams userParams(ngroup,
                          gidgroups,
                          datpath,
                          strlen(restore_path) == 0 ? datpath : restore_path,
                          checkPoints);


    // temporary bug work around. If any process has multiple threads, no
    // process can have a single thread. So, for now, if one thread, make two.
    // Fortunately, empty threads work fine.
    // Allocate NrnThread* nrn_threads of size ngroup (minimum 2)
    // Note that rank with 0 dataset/cellgroup works fine
    nrn_threads_create(userParams.ngroup <= 1 ? 2 : userParams.ngroup);

    // from nrn_has_net_event create pnttype2presyn for use in phase2.
    auto& memb_func = corenrn.get_memb_funcs();
    auto& pnttype2presyn = corenrn.get_pnttype2presyn();
    auto& nrn_has_net_event_ = corenrn.get_has_net_event();
    pnttype2presyn.clear();
    pnttype2presyn.resize(memb_func.size(), -1);
    for (size_t i = 0; i < nrn_has_net_event_.size(); ++i) {
        pnttype2presyn[nrn_has_net_event_[i]] = i;
    }

    nrnthread_chkpnt = new NrnThreadChkpnt[nrn_nthread];

    if (nrn_nthread > 1) {
        // NetCvode construction assumed one thread. Need nrn_nthread instances
        // of NetCvodeThreadData. Here since possible checkpoint restore of
        // tqueue at end of phase2.
        nrn_p_construct();
    }

    if (use_solve_interleave) {
        create_interleave_info();
    }

    /// Reserve vector of maps of size ngroup for negative gid-s
    /// std::vector< std::map<int, PreSyn*> > neg_gid2out;
    neg_gid2out.resize(userParams.ngroup);

    // bug fix. gid2out is cumulative over all threads and so do not
    // know how many there are til after phase1
    // A process's complete set of output gids and allocation of each thread's
    // nt.presyns and nt.netcons arrays.
    // Generates the gid2out map which is needed
    // to later count the required number of InputPreSyn
    /// gid2out - map of output presyn-s
    /// std::map<int, PreSyn*> gid2out;
    gid2out.clear();

    nrnthreads_netcon_srcgid.resize(nrn_nthread);
    for (int i = 0; i < nrn_nthread; ++i)
        nrnthreads_netcon_srcgid[i] = nullptr;

    // Gap junctions used to be done first in the sense of reading files
    // and calling gap_mpi_setup. But during phase2, gap_thread_setup and
    // gap_indices_permute were called after NrnThread.data was in its final
    // layout and mechanism permutation was determined. This is no longer
    // ideal as it necessitates keeping setup_info_ in existence to the end
    // of phase2.  So gap junction setup is deferred to after phase2.

    nrnthreads_netcon_negsrcgid_tid.resize(nrn_nthread);
    if (!corenrn_embedded) {
        coreneuron::phase_wrapper<coreneuron::phase::one>(userParams);
    } else {
        // direct transfer: Phase1 is constructed from the thread id instead of
        // a file reader
        nrn_multithread_job([](NrnThread* n) {
            Phase1 p1{n->id};
            NrnThread& nt = *n;
            p1.populate(nt, mut);
        });
    }

    // from the gid2out map and the nrnthreads_netcon_srcgid array,
    // fill the gid2in, and from the number of entries,
    // allocate the process wide InputPreSyn array
    determine_inputpresyn();

    // read the rest of the gidgroup's data and complete the setup for each
    // thread.
    /* nrn_multithread_job supports serial, pthread, and openmp. */
    coreneuron::phase_wrapper<coreneuron::phase::two>(userParams, corenrn_embedded);

    // gap junctions
    // Gaps are done after phase2, in order to use layout and permutation
    // information via calls to stdindex2ptr.
    if (nrn_have_gaps) {
        nrn_partrans::transfer_thread_data_ = new nrn_partrans::TransferThreadData[nrn_nthread];
        if (!corenrn_embedded) {
            nrn_partrans::setup_info_ = new SetupTransferInfo[nrn_nthread];
            coreneuron::phase_wrapper<coreneuron::gap>(userParams);
        } else {
            nrn_partrans::setup_info_ = (*nrn2core_get_partrans_setup_info_)(userParams.ngroup,
                                                                             nrn_nthread,
                                                                             sizeof(sgid_t));
        }

        nrn_multithread_job(nrn_partrans::gap_data_indices_setup);
        nrn_partrans::gap_mpi_setup(userParams.ngroup);

        // Whether allocated in NEURON or here, delete here.
        delete[] nrn_partrans::setup_info_;
        nrn_partrans::setup_info_ = nullptr;
    }

    if (is_mapping_needed)
        coreneuron::phase_wrapper<coreneuron::phase::three>(userParams);

    *mindelay = set_mindelay(*mindelay);

    if (run_setup_cleanup)  // if run_setup_cleanup==false, user must call nrn_setup_cleanup() later
        nrn_setup_cleanup();

#if INTERLEAVE_DEBUG
    // mk_cell_indices debug code is supposed to be used with cell-per-core permutations
    if (corenrn_param.cell_interleave_permute == 1) {
        mk_cell_indices();
    }
#endif

    /// Allocate memory for fast_imem calculation
    nrn_fast_imem_alloc();

    /// Generally, tables depend on a few parameters. And if those parameters change,
    /// then the table needs to be recomputed. This is obviously important in NEURON
    /// since the user can change those parameters at any time. However, there is no
    /// c example for CoreNEURON so can't see what it looks like in that context.
    /// Boils down to setting up a function pointer of the function _check_table_thread(),
    /// which is only executed by StochKV.c.
    nrn_mk_table_check();  // was done in nrn_thread_memblist_setup in multicore.c

    size_t model_size_bytes;

    if (corenrn_param.model_stats) {
        write_mech_report();
        model_size_bytes = model_size(true);
    } else {
        model_size_bytes = model_size(false);
    }

    if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
        printf(" Setup Done   : %.2lf seconds \n", nrn_wtime() - time);

        if (model_size_bytes < 1024) {
            // %zu is the conversion for size_t; %ld is only correct where
            // long and size_t happen to have the same representation.
            printf(" Model size   : %zu bytes\n", model_size_bytes);
        } else if (model_size_bytes < 1024 * 1024) {
            printf(" Model size   : %.2lf kB\n", model_size_bytes / 1024.);
        } else if (model_size_bytes < 1024 * 1024 * 1024) {
            printf(" Model size   : %.2lf MB\n", model_size_bytes / (1024. * 1024.));
        } else {
            printf(" Model size   : %.2lf GB\n", model_size_bytes / (1024. * 1024. * 1024.));
        }
    }

    // allocated by nrn_read_filesdat()
    delete[] userParams.gidgroups;
}

/// Allocate the ThreadDatum array of every mechanism in `nt` that declares a
/// non-zero thread_size_, and run the mechanism's thread_mem_init_ (if any)
/// on it while holding `mut`. Mechanisms without thread data get a null
/// `_thread` pointer.
void setup_ThreadData(NrnThread& nt) {
    for (NrnThreadMembList* node = nt.tml; node; node = node->next) {
        Memb_func& mech = corenrn.get_memb_func(node->index);
        Memb_list* mlist = node->ml;

        if (!mech.thread_size_) {
            mlist->_thread = nullptr;
            continue;
        }

        mlist->_thread = (ThreadDatum*) ecalloc_align(mech.thread_size_, sizeof(ThreadDatum));
        if (mech.thread_mem_init_) {
            // the init callbacks are serialized across threads
            const std::lock_guard<OMP_Mutex> guard(mut);
            (*mech.thread_mem_init_)(mlist->_thread);
        }
    }
}

/**
 * Read the gap junction (partrans) setup data of thread `nt` from its file
 * reader into nrn_partrans::setup_info_[nt.id].
 *
 * File content, in read order: size of sgid_t, number of targets, number of
 * sources, then (sid, type, index) arrays for the sources followed by the same
 * three arrays for the targets. Returns without reading if the reader is in a
 * failed state.
 *
 * @param nt         thread whose id selects the reader and the setup_info_ slot
 * @param userParams provides the per-thread file readers
 */
void read_phasegap(NrnThread& nt, UserParams& userParams) {
    auto& F = userParams.file_reader[nt.id];
    if (F.fail()) {
        return;
    }

    F.checkpoint(0);

    // the writer's sgid_t must have the same size as ours
    int sidt_size = F.read_int();
    assert(sidt_size == int(sizeof(sgid_t)));
    std::size_t ntar = F.read_int();
    std::size_t nsrc = F.read_int();

    auto& si = nrn_partrans::setup_info_[nt.id];
    si.src_sid.resize(nsrc);
    si.src_type.resize(nsrc);
    si.src_index.resize(nsrc);
    if (nsrc) {
        F.read_array<sgid_t>(si.src_sid.data(), nsrc);
        F.read_array<int>(si.src_type.data(), nsrc);
        F.read_array<int>(si.src_index.data(), nsrc);
    }

    si.tar_sid.resize(ntar);
    si.tar_type.resize(ntar);
    si.tar_index.resize(ntar);
    if (ntar) {
        F.read_array<sgid_t>(si.tar_sid.data(), ntar);
        F.read_array<int>(si.tar_type.data(), ntar);
        F.read_array<int>(si.tar_index.data(), ntar);
    }

#if CORENRN_DEBUG
    // size_t values need %zu; the target loop must print the tar_* arrays
    // (indexing src_* with a target index can run past their end).
    printf("%d read_phasegap tid=%d nsrc=%zu ntar=%zu\n", nrnmpi_myid, nt.id, nsrc, ntar);
    for (std::size_t i = 0; i < nsrc; ++i) {
        printf("src %zu %d %d\n", size_t(si.src_sid[i]), si.src_type[i], si.src_index[i]);
    }
    for (std::size_t i = 0; i < ntar; ++i) {
        printf("tar %zu %d %d\n", size_t(si.tar_sid[i]), si.tar_type[i], si.tar_index[i]);
    }
#endif
}

// Counterpart of nrn_dblpntr2nrncore in NEURON: translate a (mtype, index)
// pair into the address of the corresponding double in this thread, taking
// layout and permutation into account.
// mtype selects what is addressed:
//   voltage      -> nt._actual_v[index]
//   i_membrane_  -> nt.nrn_fast_imem->nrn_sav_rhs[index]
//   0            -> the thread's time variable (index is ignored)
//   other > 0    -> data of mechanism type mtype
// Any other mtype is reported and asserted on.
double* stdindex2ptr(int mtype, int index, NrnThread& nt) {
    if (mtype == voltage) {
        int ix{index};  // node index, relative to _actual_v
        nrn_assert((ix >= 0) && (ix < nt.end));
        if (nt._permute) {
            node_permute(&ix, 1, nt._permute);
        }
        return nt._actual_v + ix;
    }

    if (mtype == i_membrane_) {
        int ix{index};  // node index, relative to nrn_fast_imem->nrn_sav_rhs
        nrn_assert((ix >= 0) && (ix < nt.end));
        if (nt._permute) {
            node_permute(&ix, 1, nt._permute);
        }
        return nt.nrn_fast_imem->nrn_sav_rhs + ix;
    }

    if (mtype > 0 && mtype < static_cast<int>(corenrn.get_memb_funcs().size())) {
        // mechanism data
        Memb_list* ml = nt._ml_list[mtype];
        nrn_assert(ml);
        int ix = nrn_param_layout(index, mtype, ml);
        if (ml->_permute) {
            ix = nrn_index_permute(ix, mtype, ml);
        }
        return ml->data + ix;
    }

    if (mtype == 0) {
        // time
        return &nt._t;
    }

    printf("stdindex2ptr does not handle mtype=%d\n", mtype);
    nrn_assert(0);
    return nullptr;
}

// Split the flat index i into an instance index (icnt) and a variable index
// (isz) for `cnt` instances of `sz` variables each.
//   AoS: i = icnt * sz + isz
//   SoA: i = isz * padded_cnt + icnt, padded_cnt = nrn_soa_padded_size(cnt, layout)
// Any other layout trips the assert and leaves icnt / isz untouched.
void nrn_inverse_i_layout(int i, int& icnt, int cnt, int& isz, int sz, int layout) {
    if (layout == Layout::AoS) {
        isz = i % sz;
        icnt = i / sz;
        return;
    }

    if (layout == Layout::SoA) {
        const int stride = nrn_soa_padded_size(cnt, layout);
        isz = i / stride;
        icnt = i % stride;
        return;
    }

    assert(0);
}

/**
 * Release the global ion map built during mechanism registration.
 *
 * Standalone CoreNEURON (nrniv-core / special-core) may drop
 * nrn_ion_global_map when execution ends. When CoreNEURON is embedded in
 * NEURON the mechanisms are registered a single time, on the first call into
 * CoreNEURON, so this cleanup is only invoked for standalone runs.
 *
 * @todo coreneuron should have finalise callback which can be
 * called from NEURON for final memory cleanup including global
 * state like registered mechanisms and ions map.
 */
void nrn_cleanup_ion_map() {
    // free every row first, then the table of rows
    int row = 0;
    while (row < nrn_ion_global_map_size) {
        free_memory(nrn_ion_global_map[row]);
        ++row;
    }
    free_memory(nrn_ion_global_map);

    // leave the globals in the "no map" state
    nrn_ion_global_map_size = 0;
    nrn_ion_global_map = nullptr;
}

/// Free the FOR_NETCONS index arrays of `nt` and reset both pointers to null.
/// Each pointer is cleared before its array is deleted.
void delete_fornetcon_info(NrnThread& nt) {
    auto* const perm_indices = nt._fornetcon_perm_indices;
    nt._fornetcon_perm_indices = nullptr;
    delete[] perm_indices;

    auto* const weight_perm = nt._fornetcon_weight_perm;
    nt._fornetcon_weight_perm = nullptr;
    delete[] weight_perm;
}

/* nrn_threads_free() presumes all NrnThread and NrnThreadMembList data is
 * allocated with malloc(). This is not the case here, so let's try and fix
 * things up first. */

/**
 * Tear down the model built by nrn_setup().
 *
 * Clears the event queue, deletes the InputPreSyn objects held in gid2in,
 * clears the gid maps and the checkpoint array, then releases, for every
 * NrnThread, its mechanism lists and per-thread arrays. Finally it clears
 * netcon_in_presyn_order_, frees the threads themselves and calls the
 * interleave and gap junction cleanup routines.
 *
 * Each buffer is released with the deallocator used at this point in the
 * original code (free_memory, free, delete or delete[]); do not change one
 * without checking the matching allocation.
 */
void nrn_cleanup() {
    clear_event_queue();  // delete left-over TQItem
    // gid2in owns its InputPreSyn objects (allocated in determine_inputpresyn)
    for (auto psi: gid2in) {
        delete psi.second;
    }
    gid2in.clear();
    // gid2out entries are not deleted here; nt->presyns is delete[]d below
    gid2out.clear();

    // clean nrnthread_chkpnt
    if (nrnthread_chkpnt) {
        delete[] nrnthread_chkpnt;
        nrnthread_chkpnt = nullptr;
    }

    // clean NrnThreads
    for (int it = 0; it < nrn_nthread; ++it) {
        NrnThread* nt = nrn_threads + it;
        NrnThreadMembList* next_tml = nullptr;
        delete_fornetcon_info(*nt);
        delete_trajectory_requests(*nt);
        // Walk the mechanism list; next_tml is saved before tml is freed.
        for (NrnThreadMembList* tml = nt->tml; tml; tml = next_tml) {
            Memb_list* ml = tml->ml;

            // Run the mechanism destructor (if any) while ml is still intact.
            mod_f_t s = corenrn.get_memb_func(tml->index).destructor;
            if (s) {
                (*s)(nt, ml, tml->index);
            }

            ml->data = nullptr;  // this was pointing into memory owned by nt
            free_memory(ml->pdata);
            ml->pdata = nullptr;
            free_memory(ml->nodeindices);
            ml->nodeindices = nullptr;
            if (ml->_permute) {
                delete[] ml->_permute;
                ml->_permute = nullptr;
            }

            if (ml->_thread) {
                free_memory(ml->_thread);
                ml->_thread = nullptr;
            }

            // Destroy the global variables struct allocated in nrn_init
            if (auto* const priv_dtor = corenrn.get_memb_func(tml->index).private_destructor) {
                (*priv_dtor)(nt, ml, tml->index);
                // the private destructor is expected to have reset these
                assert(!ml->instance);
                assert(!ml->global_variables);
                assert(ml->global_variables_size == 0);
            }

            NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
            if (nrb) {
                // the member arrays are only freed when _size is non-zero
                if (nrb->_size) {
                    free_memory(nrb->_pnt_index);
                    free_memory(nrb->_weight_index);
                    free_memory(nrb->_nrb_t);
                    free_memory(nrb->_nrb_flag);
                    free_memory(nrb->_displ);
                    free_memory(nrb->_nrb_index);
                }
                free_memory(nrb);
                ml->_net_receive_buffer = nullptr;
            }

            NetSendBuffer_t* nsb = ml->_net_send_buffer;
            if (nsb) {
                delete nsb;
                ml->_net_send_buffer = nullptr;
            }

            if (tml->dependencies)
                free(tml->dependencies);

            // remember the successor before releasing this node
            next_tml = tml->next;
            free_memory(tml->ml);
            free_memory(tml);
        }

        // These are only reset, not freed here.
        // NOTE(review): presumably they point into nt->_data, which is freed
        // below — confirm.
        nt->_actual_rhs = nullptr;
        nt->_actual_d = nullptr;
        nt->_actual_a = nullptr;
        nt->_actual_b = nullptr;

        free_memory(nt->_v_parent_index);
        nt->_v_parent_index = nullptr;

        free_memory(nt->_data);
        nt->_data = nullptr;

        free(nt->_idata);
        nt->_idata = nullptr;

        free_memory(nt->_vdata);
        nt->_vdata = nullptr;

        if (nt->_permute) {
            delete[] nt->_permute;
            nt->_permute = nullptr;
        }

        if (nt->presyns_helper) {
            free_memory(nt->presyns_helper);
            nt->presyns_helper = nullptr;
        }

        if (nt->pntprocs) {
            free_memory(nt->pntprocs);
            nt->pntprocs = nullptr;
        }

        if (nt->presyns) {
            delete[] nt->presyns;
            nt->presyns = nullptr;
        }

        // one inner array per entry of corenrn.get_has_net_event()
        if (nt->pnt2presyn_ix) {
            for (size_t i = 0; i < corenrn.get_has_net_event().size(); ++i) {
                if (nt->pnt2presyn_ix[i]) {
                    free(nt->pnt2presyn_ix[i]);
                }
            }
            free_memory(nt->pnt2presyn_ix);
        }

        if (nt->netcons) {
            delete[] nt->netcons;
            nt->netcons = nullptr;
        }

        if (nt->weights) {
            free_memory(nt->weights);
            nt->weights = nullptr;
        }

        if (nt->_shadow_rhs) {
            free_memory(nt->_shadow_rhs);
            nt->_shadow_rhs = nullptr;
        }

        if (nt->_shadow_d) {
            free_memory(nt->_shadow_d);
            nt->_shadow_d = nullptr;
        }

        // guarded by the recorded size, not by the pointer
        if (nt->_net_send_buffer_size) {
            free_memory(nt->_net_send_buffer);
            nt->_net_send_buffer = nullptr;
            nt->_net_send_buffer_size = 0;
        }

        if (nt->_watch_types) {
            free(nt->_watch_types);
            nt->_watch_types = nullptr;
        }

        // mapping information is available only for non-empty NrnThread
        if (nt->mapping && nt->ncell) {
            delete ((NrnThreadMappingInfo*) nt->mapping);
        }

        free_memory(nt->_ml_list);

        // NOTE(review): fast_imem_free() takes no thread argument although it
        // is called once per thread here — confirm it tolerates repeated calls.
        if (nt->nrn_fast_imem) {
            fast_imem_free();
        }
    }

#if NRN_MULTISEND
    nrn_multisend_cleanup();
#endif

    netcon_in_presyn_order_.clear();

    nrn_threads_free();

    if (!corenrn.get_pnttype2presyn().empty()) {
        corenrn.get_pnttype2presyn().clear();
    }

    destroy_interleave_info();

    nrn_partrans::gap_cleanup();
}

/// Release the TrajectoryRequests object of `nt`, if there is one, and reset
/// the pointer. The member arrays are only released when n_trajec is non-zero.
void delete_trajectory_requests(NrnThread& nt) {
    TrajectoryRequests* const requests = nt.trajec_requests;
    if (!requests) {
        return;
    }

    if (requests->n_trajec) {
        delete[] requests->vpr;
        // scatter and varrays may be null; delete[] of a null pointer is a no-op
        delete[] requests->scatter;
        delete[] requests->varrays;
        delete[] requests->gather;
    }

    delete nt.trajec_requests;
    nt.trajec_requests = nullptr;
}

/// Phase 1 for one thread: build a Phase1 object from the thread's file reader
/// and populate `nt` from it.
void read_phase1(NrnThread& nt, UserParams& userParams) {
    auto& reader = userParams.file_reader[nt.id];
    Phase1 phase_one{reader};

    // `mut` protects gid2in, gid2out and neg_gid2out inside populate()
    phase_one.populate(nt, mut);
}

/// Phase 2 for one thread: obtain the phase 2 data either from the thread's
/// file reader or, when corenrn_embedded is set, through read_direct(), then
/// populate `nt` from it.
void read_phase2(NrnThread& nt, UserParams& userParams) {
    Phase2 phase_two;
    if (!corenrn_embedded) {
        phase_two.read_file(userParams.file_reader[nt.id], nt);
    } else {
        phase_two.read_direct(nt.id, nt);
    }
    phase_two.populate(nt, userParams);
}

/**
 * Phase 3 for one thread: read the mapping information of its neurons.
 *
 * Builds a NrnThreadMappingInfo holding one CellMapping per cell (each with
 * one SecMapping per section list), stores it in nt.mapping and installs a
 * fresh SummationReportMapping in nt.summation_report_handler_.
 */
void read_phase3(NrnThread& nt, UserParams& userParams) {
    auto& reader = userParams.file_reader[nt.id];

    // restore checkpoint state (before restoring queue items)
    reader.restore_checkpoint();

    // container for the mapping of all neurons of this NrnThread
    auto* ntmapping = new NrnThreadMappingInfo();

    int count = 0;
    reader.read_mapping_cell_count(&count);

    // the mapping file must describe exactly the cells of this NrnThread
    nrn_assert(count == nt.ncell);

    for (int icell = 0; icell < nt.ncell; ++icell) {
        // per-cell header: gid and counts
        int gid, nsec, nseg, nseclist;
        reader.read_mapping_count(&gid, &nsec, &nseg, &nseclist);

        auto* cell_map = new CellMapping(gid);

        // one section-segment mapping per section list
        for (int ilist = 0; ilist < nseclist; ++ilist) {
            auto* sec_map = new SecMapping();
            reader.read_mapping_info(sec_map);
            cell_map->add_sec_map(sec_map);
        }

        ntmapping->add_cell_mapping(cell_map);
    }

    // every cell must have produced a mapping entry
    nrn_assert((int) ntmapping->size() == nt.ncell);

    // hand the result over to the NrnThread
    nt.mapping = static_cast<void*>(ntmapping);
    nt.summation_report_handler_ = std::make_unique<SummationReportMapping>();
}

/* Returns the size of the dynamically allocated memory for NrnThreadMembList
 * Includes:
 *  - Size of NrnThreadMembList
 *  - Size of Memb_list
 *  - Size of nodeindices
 *  - Size of _permute
 *  - Size of _thread
 *  - Size of NetReceive and NetSend Buffers
 *  - Size of int variables
 *  - Size of double variables (If include_data is enabled. Those variables are already counted
 * since they point to nt->_data.)
 *
 * tml          : the mechanism list node to measure (tml->ml must be valid)
 * include_data : when true, also count param_size * nodecount doubles
 * returns      : the estimated number of bytes
 */
size_t memb_list_size(NrnThreadMembList* tml, bool include_data) {
    size_t nbyte = sizeof(NrnThreadMembList) + sizeof(Memb_list);
    // nodeindices
    nbyte += tml->ml->nodecount * sizeof(int);
    if (tml->ml->_permute) {
        nbyte += tml->ml->nodecount * sizeof(int);
    }
    if (tml->ml->_thread) {
        Memb_func& mf = corenrn.get_memb_func(tml->index);
        nbyte += mf.thread_size_ * sizeof(ThreadDatum);
    }
    if (tml->ml->_net_receive_buffer) {
        nbyte += sizeof(NetReceiveBuffer_t) + tml->ml->_net_receive_buffer->size_of_object();
    }
    if (tml->ml->_net_send_buffer) {
        nbyte += sizeof(NetSendBuffer_t) + tml->ml->_net_send_buffer->size_of_object();
    }
    if (include_data) {
        nbyte += corenrn.get_prop_param_size()[tml->index] * tml->ml->nodecount * sizeof(double);
    }
    // pdata
    nbyte += corenrn.get_prop_dparam_size()[tml->index] * tml->ml->nodecount * sizeof(Datum);
#ifdef DEBUG
    int i = tml->index;
    // nbyte is a size_t, hence %zu
    printf("%s %d psize=%d ppsize=%d cnt=%d nbyte=%zu\n",
           corenrn.get_memb_func(i).sym,
           i,
           corenrn.get_prop_param_size()[i],
           corenrn.get_prop_dparam_size()[i],
           tml->ml->nodecount,
           nbyte);
#endif
    return nbyte;
}

/// Rough byte estimate for the gid2out table: the container object itself
/// plus one (int key, PreSyn* value) pair per entry. Returns 0 when empty.
size_t output_presyn_size(void) {
    const auto n_entries = gid2out.size();
    if (n_entries == 0) {
        return 0;
    }
    size_t nbyte = sizeof(gid2out) + n_entries * (sizeof(int) + sizeof(PreSyn*));
#ifdef DEBUG
    printf(" gid2out table bytes=~%ld size=%ld\n", nbyte, n_entries);
#endif
    return nbyte;
}

/// Rough byte estimate for the gid2in table: the container object itself
/// plus one (int key, InputPreSyn* value) pair per entry. Returns 0 when empty.
size_t input_presyn_size(void) {
    const auto n_entries = gid2in.size();
    if (n_entries == 0) {
        return 0;
    }
    size_t nbyte = sizeof(gid2in) + n_entries * (sizeof(int) + sizeof(InputPreSyn*));
#ifdef DEBUG
    printf(" gid2in table bytes=~%ld size=%ld\n", nbyte, n_entries);
#endif
    return nbyte;
}

/**
 * @brief Estimate the memory used by the model and optionally print a report.
 *
 * For every NrnThread the estimate adds up the mechanism lists (via
 * memb_list_size(), without the double data which is counted through _ndata),
 * the NrnThread structure, its data arrays, the parent index array and the
 * network connectivity objects. The gid tables, the NetCon pointer table and
 * the nrnran123 states are added once per rank.
 *
 * @param detailed_report when true, rank 0 prints min/max/avg (over ranks when
 *        MPI is enabled, otherwise the local values) of the collected counters.
 * @return estimated number of bytes; summed over all ranks when MPI is enabled,
 *         otherwise the local estimate.
 */
size_t model_size(bool detailed_report) {
    // number of counters gathered for the detailed report (rows printed below)
    constexpr int n_stats = 13;

    long nbyte = 0;
    size_t sz_nrnThread = sizeof(NrnThread);
    size_t sz_presyn = sizeof(PreSyn);
    size_t sz_input_presyn = sizeof(InputPreSyn);
    size_t sz_netcon = sizeof(NetCon);
    size_t sz_pntproc = sizeof(Point_process);
    size_t nccnt = 0;

    std::vector<long> size_data(n_stats, 0);
    std::vector<long> global_size_data_min(n_stats, 0);
    std::vector<long> global_size_data_max(n_stats, 0);
    std::vector<long> global_size_data_sum(n_stats, 0);
    // double, not float: a float mantissa cannot hold byte counts above ~16 MiB
    // exactly, which made the printed averages visibly wrong for large models
    std::vector<double> global_size_data_avg(n_stats, 0.0);

    for (int i = 0; i < nrn_nthread; ++i) {
        NrnThread& nt = nrn_threads[i];
        size_t nb_nt = 0;  // per thread
        nccnt += nt.n_netcon;

        // Memb_list size
        int nmech = 0;
        for (auto tml = nt.tml; tml; tml = tml->next) {
            nb_nt += memb_list_size(tml, false);
            ++nmech;
        }

        // basic thread size includes mechanism data and G*V=I matrix
        nb_nt += sz_nrnThread;
        nb_nt += nt._ndata * sizeof(double) + nt._nidata * sizeof(int) + nt._nvdata * sizeof(void*);
        nb_nt += nt.end * sizeof(int);  // _v_parent_index

        // network connectivity
        nb_nt += nt.n_pntproc * sz_pntproc + nt.n_netcon * sz_netcon + nt.n_presyn * sz_presyn +
                 nt.n_input_presyn * sz_input_presyn + nt.n_weight * sizeof(double);
        nbyte += nb_nt;

#ifdef DEBUG
        printf("ncell=%d end=%d nmech=%d\n", nt.ncell, nt.end, nmech);
        printf("ndata=%ld nidata=%ld nvdata=%ld\n", nt._ndata, nt._nidata, nt._nvdata);
        printf("nbyte so far %ld\n", nb_nt);
        printf("n_presyn = %d sz=%ld nbyte=%ld\n", nt.n_presyn, sz_presyn, nt.n_presyn * sz_presyn);
        printf("n_input_presyn = %d sz=%ld nbyte=%ld\n",
               nt.n_input_presyn,
               sz_input_presyn,
               nt.n_input_presyn * sz_input_presyn);
        printf("n_pntproc=%d sz=%ld nbyte=%ld\n",
               nt.n_pntproc,
               sz_pntproc,
               nt.n_pntproc * sz_pntproc);
        printf("n_netcon=%d sz=%ld nbyte=%ld\n", nt.n_netcon, sz_netcon, nt.n_netcon * sz_netcon);
        printf("n_weight = %d\n", nt.n_weight);

        printf("%d thread %d total bytes %ld\n", nrnmpi_myid, i, nb_nt);
#endif

        if (detailed_report) {
            size_data[0] += nt.ncell;
            size_data[1] += nt.end;
            size_data[2] += nmech;
            size_data[3] += nt._ndata;
            size_data[4] += nt._nidata;
            size_data[5] += nt._nvdata;
            size_data[6] += nt.n_presyn;
            size_data[7] += nt.n_input_presyn;
            size_data[8] += nt.n_pntproc;
            size_data[9] += nt.n_netcon;
            size_data[10] += nt.n_weight;
            size_data[11] += nb_nt;
        }
    }

    nbyte += nccnt * sizeof(NetCon*);
    nbyte += output_presyn_size();
    nbyte += input_presyn_size();

    nbyte += nrnran123_instance_count() * nrnran123_state_size();

#ifdef DEBUG
    printf("%d netcon pointers %ld  nbyte=%ld\n", nrnmpi_myid, nccnt, nccnt * sizeof(NetCon*));
    printf("nrnran123 size=%ld cnt=%ld nbyte=%ld\n",
           nrnran123_state_size(),
           nrnran123_instance_count(),
           nrnran123_instance_count() * nrnran123_state_size());
    printf("%d total bytes %ld\n", nrnmpi_myid, nbyte);
#endif
    if (detailed_report) {
        size_data[12] = nbyte;
#if NRNMPI
        if (corenrn_param.mpi_enable) {
            // last arg is op type where 1 is sum, 2 is max and any other value is min
            nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_sum[0], n_stats, 1);
            nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_max[0], n_stats, 2);
            nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_min[0], n_stats, 3);
            for (int i = 0; i < n_stats; i++) {
                global_size_data_avg[i] = global_size_data_sum[i] /
                                          static_cast<double>(nrnmpi_numprocs);
            }
        } else
#endif
        {
            // single process: min, max and avg are all the local values
            global_size_data_max = size_data;
            global_size_data_min = size_data;
            global_size_data_avg.assign(size_data.cbegin(), size_data.cend());
        }
        // now print the collected data:
        if (nrnmpi_myid == 0) {
            // one table row: label, min, max, avg
            const auto print_row = [](const char* field, long min_v, long max_v, double avg_v) {
                printf("%22s %12ld %12ld %15.2f\n", field, min_v, max_v, avg_v);
            };
            // row showing the raw counter stored at position idx
            const auto print_count = [&](const char* field, int idx) {
                print_row(field,
                          global_size_data_min[idx],
                          global_size_data_max[idx],
                          global_size_data_avg[idx]);
            };
            // row showing the counter at position idx scaled by the size of one element
            const auto print_bytes = [&](const char* field, int idx, size_t elem_size) {
                print_row(field,
                          static_cast<long>(global_size_data_min[idx] * elem_size),
                          static_cast<long>(global_size_data_max[idx] * elem_size),
                          global_size_data_avg[idx] * elem_size);
            };

            printf("Memory size information for all NrnThreads per rank\n");
            printf("------------------------------------------------------------------\n");
            printf("%22s %12s %12s %12s\n", "field", "min", "max", "avg");
            print_count("n_cell", 0);
            print_count("n_compartment", 1);
            print_count("n_mechanism", 2);
            print_count("_ndata", 3);
            print_count("_nidata", 4);
            print_count("_nvdata", 5);
            print_count("n_presyn", 6);
            print_bytes("n_presyn (bytes)", 6, sz_presyn);
            print_count("n_input_presyn", 7);
            print_bytes("n_input_presyn (bytes)", 7, sz_input_presyn);
            print_count("n_pntproc", 8);
            print_bytes("n_pntproc (bytes)", 8, sz_pntproc);
            print_count("n_netcon", 9);
            print_bytes("n_netcon (bytes)", 9, sz_netcon);
            print_count("n_weight", 10);
            print_count("NrnThread (bytes)", 11);
            print_count("model size (bytes)", 12);
        }
    }

#if NRNMPI
    if (corenrn_param.mpi_enable) {
        // report the total over all ranks
        long global_nbyte = 0;
        nrnmpi_long_allreduce_vec(&nbyte, &global_nbyte, 1, 1);
        nbyte = global_nbyte;
    }
#endif

    return nbyte;
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrn_setup.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <string>
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/nrn_filehandler.hpp"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/io/user_params.hpp"
#include "coreneuron/io/mem_layout_util.hpp"
#include "coreneuron/io/nrn_checkpoint.hpp"

namespace coreneuron {
/// Per-phase readers for one NrnThread. Each one is reached through the
/// matching read_phase_aux<> specialization defined further down in this header.
void read_phase1(NrnThread& nt, UserParams& userParams);
void read_phase2(NrnThread& nt, UserParams& userParams);
void read_phase3(NrnThread& nt, UserParams& userParams);
void read_phasegap(NrnThread& nt, UserParams& userParams);
/// Invoked by phase_wrapper_w() for a thread right after its phase two read.
void setup_ThreadData(NrnThread& nt);

/// Model setup entry point.
/// NOTE(review): parameter meanings are not visible from this header; the names
/// suggest dataset/restore locations and an optional minimum-delay output --
/// confirm against the definition before relying on that.
void nrn_setup(const char* filesdat,
               bool is_mapping_needed,
               CheckPoints& checkPoints,
               bool run_setup_cleanup = true,
               const char* datapath = "",
               const char* restore_path = "",
               double* mindelay = nullptr);

// Functions to load and clean data;
extern void nrn_init_and_load_data(int argc,
                                   char** argv,
                                   CheckPoints& checkPoints,
                                   bool is_mapping_needed = false,
                                   bool run_setup_cleanup = true);
extern void allocate_data_in_mechanism_nrn_init();
extern void nrn_setup_cleanup();

/// NOTE(review): presumably maps element (i, j) of a cnt x size block to its
/// flat index for the given memory layout -- confirm against the definition.
extern int nrn_i_layout(int i, int cnt, int j, int size, int layout);

/// Size in bytes of the dynamically allocated memory behind one
/// NrnThreadMembList entry; the double data is counted only when
/// include_data is true.
size_t memb_list_size(NrnThreadMembList* tml, bool include_data);

/// Estimated memory footprint of the model in bytes; when detailed_report is
/// true a per-rank table of the contributing counters is printed as well.
size_t model_size(bool detailed_report);

namespace coreneuron {


/// Identifiers of the dataset reading phases (numbered from 1).
enum phase { one = 1, two = 2, three = 3, gap = 4 };

/// Textual tag of a phase as used in the data file names
/// (`<gid>_<tag>.dat`). Only the explicit specializations below exist.
template <phase P>
inline std::string getPhaseName();

/// Phase one -> "1".
template <>
inline std::string getPhaseName<one>() {
    return std::string("1");
}

/// Phase two -> "2".
template <>
inline std::string getPhaseName<two>() {
    return std::string("2");
}

/// Phase three -> "3".
template <>
inline std::string getPhaseName<three>() {
    return std::string("3");
}

/// Gap junction phase -> "gap".
template <>
inline std::string getPhaseName<gap>() {
    return std::string("gap");
}

/// Compile-time dispatch from a phase tag to the reader of that phase.
/// Only the explicit specializations below are defined.
template <phase P>
inline void read_phase_aux(NrnThread& nt, UserParams&);

/// Phase one forwards to read_phase1().
template <>
inline void read_phase_aux<one>(NrnThread& nt, UserParams& userParams) {
    return read_phase1(nt, userParams);
}

/// Phase two forwards to read_phase2().
template <>
inline void read_phase_aux<two>(NrnThread& nt, UserParams& userParams) {
    return read_phase2(nt, userParams);
}

/// Phase three forwards to read_phase3().
template <>
inline void read_phase_aux<three>(NrnThread& nt, UserParams& userParams) {
    return read_phase3(nt, userParams);
}

/// The gap phase forwards to read_phasegap().
template <>
inline void read_phase_aux<gap>(NrnThread& nt, UserParams& userParams) {
    return read_phasegap(nt, userParams);
}

/// Per-thread worker that reads phase P for the cell group owned by `nt`.
///
/// Threads whose id is not below `userParams.ngroup` do nothing. In file mode
/// (`in_memory_transfer == false`) the `<gid>_<phase>.dat` file is opened before
/// the read and closed afterwards; in memory mode no file is touched. After a
/// phase two read, setup_ThreadData() is run for the thread.
///
/// @return always nullptr.
template <phase P>
inline void* phase_wrapper_w(NrnThread* nt, UserParams& userParams, bool in_memory_transfer) {
    const int group = nt->id;
    if (group >= userParams.ngroup) {
        return nullptr;
    }

    const bool from_files = !in_memory_transfer;
    if (from_files) {
        // Phase two may come from the restore directory when restoring a
        // checkpoint; every other phase is constant data and always comes
        // from the dataset directory.
        const char* data_dir = (P == two) ? userParams.restore_path : userParams.path;

        std::string fname(data_dir);
        fname += "/";
        fname += std::to_string(userParams.gidgroups[group]);
        fname += "_";
        fname += getPhaseName<P>();
        fname += ".dat";

        // A gid without gap junctions has no gid_gap.dat. The reader is still
        // holding the previously opened file (files are opened in the order
        // gid_1.dat, gid_2.dat, gid_gap.dat), so it has to be closed here even
        // though there is nothing new to open.
        const bool missing_gap_file = (P == gap) && !FileHandler::file_exist(fname);
        if (missing_gap_file) {
            userParams.file_reader[group].close();
        } else {
            // regular case: switch the reader over to this phase's file
            userParams.file_reader[group].open(fname);
        }
    }

    read_phase_aux<P>(*nt, userParams);

    if (from_files) {
        userParams.file_reader[group].close();
    }
    if (P == two) {
        setup_ThreadData(*nt);
    }
    return nullptr;
}

/// Run the phase P reader on every thread through nrn_multithread_job().
/// A non-zero `direct` selects in-memory transfer instead of reading files.
template <phase P>
inline static void phase_wrapper(UserParams& userParams, int direct = 0) {
    nrn_multithread_job(phase_wrapper_w<P>, userParams, static_cast<bool>(direct));
}
}  // namespace coreneuron
}  // namespace coreneuron


================================================
FILE: coreneuron/io/nrnsection_mapping.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <numeric>
#include <string>
#include <utility>
#include <vector>
#include <map>
#include <iostream>

namespace coreneuron {

/** type to store every section and associated segments */
using segvec_type = std::vector<int>;
using secseg_map_type = std::map<int, segvec_type>;
using secseg_it_type = secseg_map_type::iterator;

/** @brief Section to segment mapping
 *
 *  For a section list (of a particular type), store the mapping
 *  of section to segments.
 *  A section is an arbitrary user classification to recognize some segments
 *  (ex: apic, soma, dend, axon).
 */
struct SecMapping {
    /** name of section list */
    std::string name;

    /** map of section and associated segments */
    secseg_map_type secmap;

    SecMapping() = default;

    explicit SecMapping(std::string s)
        : name(std::move(s)) {}

    /** @brief return total number of sections in section list */
    size_t num_sections() const noexcept {
        return secmap.size();
    }

    /** @brief return number of segments in section list */
    size_t num_segments() const {
        // accumulate in size_t so the running sum has the same width as the result
        return std::accumulate(secmap.begin(),
                               secmap.end(),
                               size_t{0},
                               [](size_t psum, const auto& item) {
                                   return psum + item.second.size();
                               });
    }

    /** @brief add a segment to the given section (the section is created on first use) */
    void add_segment(int sec, int seg) {
        secmap[sec].push_back(seg);
    }
};

/** @brief Compartment mapping information for a cell
 *
 * A cell can have multiple section list types like
 * soma, axon, apic, dend etc. User will add these
 * section lists using HOC interface.
 *
 * The object owns the SecMapping pointers handed to add_sec_map() and deletes
 * them in its destructor; it is therefore not copyable.
 */
struct CellMapping {
    /** gid of a cell */
    int gid;

    /** list of section lists (like soma, axon, apic); owned by this object */
    std::vector<SecMapping*> secmapvec;

    CellMapping(int g)
        : gid(g) {}

    // A copy would share the owned pointers and delete them twice.
    CellMapping(const CellMapping&) = delete;
    CellMapping& operator=(const CellMapping&) = delete;

    /** @brief total number of sections in a cell */
    int num_sections() const {
        return std::accumulate(secmapvec.begin(),
                               secmapvec.end(),
                               0,
                               [](int psum, const auto& secmap) {
                                   return static_cast<int>(psum + secmap->num_sections());
                               });
    }

    /** @brief return number of segments in a cell */
    int num_segments() const {
        return std::accumulate(secmapvec.begin(),
                               secmapvec.end(),
                               0,
                               [](int psum, const auto& secmap) {
                                   return static_cast<int>(psum + secmap->num_segments());
                               });
    }

    /** @brief number of section lists */
    size_t size() const noexcept {
        return secmapvec.size();
    }

    /** @brief add new SecMapping; ownership of `s` passes to this object */
    void add_sec_map(SecMapping* s) {
        secmapvec.push_back(s);
    }

    /** @brief return section list mapping with given name
     *
     *  Prints a warning to stdout and returns nullptr when no section list
     *  with that name has been added.
     */
    SecMapping* get_seclist_mapping(const std::string& name) const {
        for (auto& secmap: secmapvec) {
            if (name == secmap->name) {
                return secmap;
            }
        }

        std::cout << "Warning: Section mapping list " << name << " doesn't exist! \n";
        return nullptr;
    }

    /** @brief return segment count for specific section list with given name (0 if unknown) */
    size_t get_seclist_segment_count(const std::string& name) const {
        const SecMapping* s = get_seclist_mapping(name);
        return s ? s->num_segments() : 0;
    }

    /** @brief return section count for specific section list with given name (0 if unknown) */
    size_t get_seclist_section_count(const std::string& name) const {
        const SecMapping* s = get_seclist_mapping(name);
        return s ? s->num_sections() : 0;
    }

    /** @brief release every owned SecMapping */
    ~CellMapping() {
        for (SecMapping* secmap: secmapvec) {
            delete secmap;
        }
    }
};

/** @brief Compartment mapping information for NrnThread
 *
 * NrnThread could have more than one cell in cellgroup
 * and we store this in vector.
 *
 * The object owns the CellMapping pointers handed to add_cell_mapping() and
 * deletes them in its destructor; it is therefore not copyable.
 */
struct NrnThreadMappingInfo {
    /** list of cells mapping; owned by this object */
    std::vector<CellMapping*> mappingvec;

    NrnThreadMappingInfo() = default;

    // A copy would share the owned pointers and delete them twice.
    NrnThreadMappingInfo(const NrnThreadMappingInfo&) = delete;
    NrnThreadMappingInfo& operator=(const NrnThreadMappingInfo&) = delete;

    /** @brief number of cells */
    size_t size() const {
        return mappingvec.size();
    }

    /** @brief memory cleanup */
    ~NrnThreadMappingInfo() {
        for (CellMapping* mapping: mappingvec) {
            delete mapping;
        }
    }

    /** @brief get cell mapping information for given gid
     *	if exist otherwise return nullptr.
     */
    CellMapping* get_cell_mapping(int gid) const {
        for (const auto& mapping: mappingvec) {
            if (mapping->gid == gid) {
                return mapping;
            }
        }
        return nullptr;
    }

    /** @brief add mapping information of new cell; ownership of `c` passes to this object */
    void add_cell_mapping(CellMapping* c) {
        mappingvec.push_back(c);
    }
};
}  // namespace coreneuron


================================================
FILE: coreneuron/io/output_spikes.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <iostream>
#include <sstream>
#include <cstring>
#include <stdexcept>  // std::lenght_error
#include <vector>
#include <algorithm>
#include <numeric>
#include <limits>

#include "coreneuron/nrnconf.h"
#include "coreneuron/io/nrn2core_direct.h"
#include "coreneuron/io/output_spikes.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/utils/nrnmutdec.hpp"
#include "coreneuron/mpi/nrnmpidec.h"
#include "coreneuron/utils/string_utils.h"
#include "coreneuron/apps/corenrn_parameters.hpp"
#ifdef ENABLE_SONATA_REPORTS
#include "bbp/sonata/reports.h"
#endif  // ENABLE_SONATA_REPORTS

/**
 * @brief Try to hand all recorded spikes back to NEURON
 *
 * @param spiketvec - spike times collected by the CORENEURON simulation
 * @param spikegidvec - gids matching the entries of spiketvec
 * @return true only when running embedded, the NEURON callback is installed
 *         and that callback reports success; false otherwise
 */
static bool all_spikes_return(std::vector<double>& spiketvec, std::vector<int>& spikegidvec) {
    if (!corenrn_embedded) {
        return false;
    }
    if (!nrn2core_all_spike_vectors_return_) {
        return false;
    }
    return (*nrn2core_all_spike_vectors_return_)(spiketvec, spikegidvec);
}

namespace coreneuron {
/// --> Coreneuron as SpikeBuffer class
std::vector<double> spikevec_time;
std::vector<int> spikevec_gid;

static OMP_Mutex mut;

/// Pre-allocate room for `sz` spikes in the global time and gid buffers.
/// A std::length_error raised by reserve() is reported on stderr and otherwise
/// ignored, leaving the buffers to grow on demand.
void mk_spikevec_buffer(int sz) {
    try {
        spikevec_time.reserve(sz);
        spikevec_gid.reserve(sz);
    } catch (const std::length_error& le) {
        std::cerr << "Length error: " << le.what() << std::endl;
    }
}

/// Acquire the file-local mutex `mut`.
/// NOTE(review): presumably meant to guard concurrent appends to
/// spikevec_time / spikevec_gid -- confirm against the callers.
void spikevec_lock() {
    mut.lock();
}

/// Release the file-local mutex `mut` taken by spikevec_lock().
void spikevec_unlock() {
    mut.unlock();
}

/// Sort spikes by time, breaking ties by gid and then by original position.
///
/// @param isvect input spike times
/// @param isvecg input gids, one per entry of isvect
/// @param osvect receives the sorted times (resized to isvect.size())
/// @param osvecg receives the gids in the same order (resized to isvecg.size())
static void local_spikevec_sort(std::vector<double>& isvect,
                                std::vector<int>& isvecg,
                                std::vector<double>& osvect,
                                std::vector<int>& osvecg) {
    const std::size_t n_spikes = isvect.size();
    osvect.resize(n_spikes);
    osvecg.resize(isvecg.size());

    // order[k] is the input position of the spike that ends up at output position k
    std::vector<std::size_t> order(n_spikes);
    std::iota(order.begin(), order.end(), 0);

    // time is the primary key, gid the secondary one; the stable sort keeps
    // the input order for spikes that agree on both
    std::stable_sort(order.begin(), order.end(), [&](std::size_t lhs, std::size_t rhs) {
        if (isvect[lhs] < isvect[rhs]) {
            return true;
        }
        if (isvect[rhs] < isvect[lhs]) {
            return false;
        }
        return isvecg[lhs] < isvecg[rhs];
    });

    // gather both columns through the permutation
    for (std::size_t k = 0; k < n_spikes; ++k) {
        osvect[k] = isvect[order[k]];
        osvecg[k] = isvecg[order[k]];
    }
}

#if NRNMPI

/// Globally sort the spikes across all ranks by time (ties broken by gid).
///
/// The global time range is split into `nrnmpi_numprocs` equal bins; rank r
/// receives the spikes falling into bin r, then sorts them locally. On return
/// the two vectors hold this rank's share in sorted order.
///
/// NOTE(review): the exchange sends contiguous ranges of the input vectors,
/// so it assumes the local spikes are already ordered by time bin on entry --
/// confirm that the recording side guarantees this.
static void sort_spikes(std::vector<double>& spikevec_time, std::vector<int>& spikevec_gid) {
    // neutral elements for the reductions; lowest() (not min(), which is the
    // smallest *positive* double) is the identity for a maximum
    double lmin_time = std::numeric_limits<double>::max();
    double lmax_time = std::numeric_limits<double>::lowest();
    if (!spikevec_time.empty()) {
        lmin_time = *(std::min_element(spikevec_time.begin(), spikevec_time.end()));
        lmax_time = *(std::max_element(spikevec_time.begin(), spikevec_time.end()));
    }
    double min_time = nrnmpi_dbl_allmin(lmin_time);
    double max_time = nrnmpi_dbl_allmax(lmax_time);

    // allocate send and receive counts and displacements for MPI_Alltoallv
    std::vector<int> snd_cnts(nrnmpi_numprocs);
    std::vector<int> rcv_cnts(nrnmpi_numprocs);
    std::vector<int> snd_dsps(nrnmpi_numprocs);
    std::vector<int> rcv_dsps(nrnmpi_numprocs);

    double bin_t = (max_time - min_time) / nrnmpi_numprocs;
    // all spikes at the same time (or no spikes at all): any positive width works
    if (!(bin_t > 0.0)) {
        bin_t = 1.0;
    }
    // first find number of spikes in each time window
    const int last_bin = nrnmpi_numprocs - 1;
    for (const auto& st: spikevec_time) {
        // divide first, then truncate; the latest spike lands exactly on the
        // upper edge (quotient == nrnmpi_numprocs) and must go to the last bin
        const int idx = static_cast<int>((st - min_time) / bin_t);
        snd_cnts[std::min(std::max(idx, 0), last_bin)]++;
    }
    for (int i = 1; i < nrnmpi_numprocs; i++) {
        snd_dsps[i] = snd_dsps[i - 1] + snd_cnts[i - 1];
    }

    // now let each rank know how many spikes they will receive
    // and get in turn all the buffer sizes to receive
    nrnmpi_int_alltoall(&snd_cnts[0], &rcv_cnts[0], 1);
    for (int i = 1; i < nrnmpi_numprocs; i++) {
        rcv_dsps[i] = rcv_dsps[i - 1] + rcv_cnts[i - 1];
    }
    std::size_t new_sz = 0;
    for (const auto& r: rcv_cnts) {
        new_sz += r;
    }
    // prepare new sorted vectors
    std::vector<double> svt_buf(new_sz, 0.0);
    std::vector<int> svg_buf(new_sz, 0);

    // now exchange data
    nrnmpi_dbl_alltoallv(spikevec_time.data(),
                         &snd_cnts[0],
                         &snd_dsps[0],
                         svt_buf.data(),
                         &rcv_cnts[0],
                         &rcv_dsps[0]);
    nrnmpi_int_alltoallv(spikevec_gid.data(),
                         &snd_cnts[0],
                         &snd_dsps[0],
                         svg_buf.data(),
                         &rcv_cnts[0],
                         &rcv_dsps[0]);

    // order the received share and store it back into the caller's vectors
    local_spikevec_sort(svt_buf, svg_buf, spikevec_time, spikevec_gid);
}

#ifdef ENABLE_SONATA_REPORTS
/** Split spikevec_time and spikevec_gid by populations.
 *  Add the spike data with population name and gid offset to the
 *  libsonatareport API.
 *
 *  Without population info, every spike is reported under the population
 *  "All" with offset 0. Otherwise population k covers the gids from its own
 *  offset up to (but excluding) the offset of population k+1; the last
 *  population is unbounded above.
 */
void output_spike_populations(const SpikesInfo& spikes_info) {
    // Write spikes with default population name and offset
    if (spikes_info.population_info.empty()) {
        sonata_add_spikes_population("All",
                                     0,
                                     spikevec_time.data(),
                                     spikevec_time.size(),
                                     spikevec_gid.data(),
                                     spikevec_gid.size());
        return;
    }
    const std::size_t n_populations = spikes_info.population_info.size();
    const std::size_t n_spikes = spikevec_gid.size();
    for (std::size_t idx = 0; idx < n_populations; idx++) {
        const auto& curr_pop = spikes_info.population_info[idx];
        std::string population_name = curr_pop.first;
        int population_offset = curr_pop.second;
        int gid_lower = population_offset;
        int gid_upper = std::numeric_limits<int>::max();
        if (idx + 1 != n_populations) {
            gid_upper = spikes_info.population_info[idx + 1].second - 1;
        }
        std::vector<double> pop_spikevec_time;
        std::vector<int> pop_spikevec_gid;
        // size_t index: avoids the signed/unsigned comparison and the int
        // overflow the previous `int` counter had for very large spike counts
        for (std::size_t j = 0; j < n_spikes; j++) {
            if (spikevec_gid[j] >= gid_lower && spikevec_gid[j] <= gid_upper) {
                pop_spikevec_time.push_back(spikevec_time[j]);
                pop_spikevec_gid.push_back(spikevec_gid[j]);
            }
        }
        sonata_add_spikes_population(population_name.data(),
                                     population_offset,
                                     pop_spikevec_time.data(),
                                     pop_spikevec_time.size(),
                                     pop_spikevec_gid.data(),
                                     pop_spikevec_gid.size());
    }
}
#endif  // ENABLE_SONATA_REPORTS

/** Write generated spikes to out.dat using mpi parallel i/o.
 *
 *  When SONATA reports are enabled the spikes are additionally written through
 *  the libsonatareport API before they are sorted. The spikes are then sorted
 *  globally, formatted as one "time<TAB>gid" line each and handed to
 *  nrnmpi_write_file().
 *
 *  \todo : MPI related code should be factored into nrnmpi.c
 *          Check spike record length which is set to 64 chars
 */
static void output_spikes_parallel(const char* outpath, const SpikesInfo& spikes_info) {
    std::stringstream ss;
    ss << outpath << "/out.dat";
    std::string fname = ss.str();

    // remove if file already exist
    if (nrnmpi_myid == 0) {
        remove(fname.c_str());
    }
#ifdef ENABLE_SONATA_REPORTS
    sonata_create_spikefile(outpath, spikes_info.file_name.data());
    output_spike_populations(spikes_info);
    sonata_write_spike_populations();
    sonata_close_spikefile();
#endif  // ENABLE_SONATA_REPORTS

    sort_spikes(spikevec_time, spikevec_gid);
    nrnmpi_barrier();

    // each spike record in the file is time + gid (64 chars sufficient)
    const int SPIKE_RECORD_LEN = 64;
    size_t num_spikes = spikevec_gid.size();
    // One extra byte for the terminating NUL: a rank without spikes would
    // otherwise request zero bytes and then write the terminator out of bounds
    // (or get a null pointer back and be treated as an allocation failure).
    size_t num_bytes = (sizeof(char) * num_spikes * SPIKE_RECORD_LEN) + 1;
    char* spike_data = (char*) malloc(num_bytes);

    if (spike_data == nullptr) {
        // NOTE(review): returning here skips nrnmpi_write_file() on this rank
        // only; if that call is collective the other ranks would wait forever.
        printf("Error while writing spikes due to memory allocation\n");
        return;
    }

    // empty if no spikes
    spike_data[0] = '\0';

    // populate buffer with all spike entries
    char spike_entry[SPIKE_RECORD_LEN];
    size_t spike_data_offset = 0;
    for (size_t i = 0; i < num_spikes; i++) {
        int spike_entry_chars = snprintf(
            spike_entry, SPIKE_RECORD_LEN, "%.8g\t%d\n", spikevec_time[i], spikevec_gid[i]);
        spike_data_offset =
            strcat_at_pos(spike_data, spike_data_offset, spike_entry, spike_entry_chars);
    }

    // calculate offset into global file. note that we don't write
    // all num_bytes but only "populated" buffer
    size_t num_chars = strlen(spike_data);

    nrnmpi_write_file(fname, spike_data, num_chars);

    free(spike_data);
}
#endif

/// Write the locally recorded spikes to `outpath`/out.dat, sorted by time and
/// then by gid, one "time<TAB>gid" line per spike. Entries with a negative gid
/// are skipped. If the file cannot be opened nothing is written; rank 0 prints
/// a warning.
static void output_spikes_serial(const char* outpath) {
    std::stringstream ss;
    ss << outpath << "/out.dat";
    std::string fname = ss.str();

    // reserve some space for sorted spikevec buffers
    std::vector<double> sorted_spikevec_time(spikevec_time.size());
    std::vector<int> sorted_spikevec_gid(spikevec_gid.size());
    local_spikevec_sort(spikevec_time, spikevec_gid, sorted_spikevec_time, sorted_spikevec_gid);

    // remove if file already exist
    remove(fname.c_str());

    FILE* f = fopen(fname.c_str(), "w");
    if (!f) {
        // Bail out on every rank: only the message is restricted to rank 0.
        // Previously a failure on any other rank fell through and used the
        // null FILE* in fprintf()/fclose().
        if (nrnmpi_myid == 0) {
            std::cout << "WARNING: Could not open file for writing spikes." << std::endl;
        }
        return;
    }

    for (std::size_t i = 0; i < sorted_spikevec_gid.size(); ++i) {
        if (sorted_spikevec_gid[i] > -1) {
            fprintf(f, "%.8g\t%d\n", sorted_spikevec_time[i], sorted_spikevec_gid[i]);
        }
    }

    fclose(f);
}

/// Deliver the recorded spikes and empty the spike buffers afterwards.
///
/// The spikes are first offered to NEURON; if it accepts them no out.dat is
/// written. Otherwise they are written with the parallel writer when MPI is
/// compiled in, enabled and initialized, and with the serial writer in every
/// other case.
void output_spikes(const char* outpath, const SpikesInfo& spikes_info) {
    // successful transfer to NEURON: nothing to write
    if (all_spikes_return(spikevec_time, spikevec_gid)) {
        clear_spike_vectors();
        return;
    }
#if NRNMPI
    if (corenrn_param.mpi_enable && nrnmpi_initialized()) {
        output_spikes_parallel(outpath, spikes_info);
        clear_spike_vectors();
        return;
    }
#endif
    output_spikes_serial(outpath);
    clear_spike_vectors();
}

/// Drop all recorded spikes while asking each buffer to keep the capacity it
/// had before the call.
void clear_spike_vectors() {
    const auto time_capacity = spikevec_time.capacity();
    const auto gid_capacity = spikevec_gid.capacity();

    spikevec_time.clear();
    spikevec_time.reserve(time_capacity);

    spikevec_gid.clear();
    spikevec_gid.reserve(gid_capacity);
}

/// Append every recorded spike with a non-negative gid to `res` as a
/// (time, gid) pair, in recording order.
void validation(std::vector<std::pair<double, int>>& res) {
    for (unsigned idx = 0; idx < spikevec_gid.size(); ++idx) {
        const int gid = spikevec_gid[idx];
        if (gid < 0) {
            continue;
        }
        res.emplace_back(spikevec_time[idx], gid);
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/output_spikes.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <string>
#include <vector>
#include <utility>
#include "coreneuron/io/reports/nrnreport.hpp"
namespace coreneuron {
/// Hand the recorded spikes to NEURON when that is possible, otherwise write
/// them to `outpath`/out.dat; the spike buffers are cleared afterwards.
void output_spikes(const char* outpath, const SpikesInfo& spikes_info);
/// Reserve room for the given number of spikes in both spike buffers.
void mk_spikevec_buffer(int);

/// Spike times recorded so far; entry i belongs to spikevec_gid[i].
extern std::vector<double> spikevec_time;
/// Gids of the recorded spikes; entry i belongs to spikevec_time[i].
extern std::vector<int> spikevec_gid;

/// Remove all recorded spikes from both buffers.
void clear_spike_vectors();
/// Append every recorded spike with a non-negative gid to `res` as (time, gid).
void validation(std::vector<std::pair<double, int>>& res);

/// Lock / unlock the mutex defined alongside the spike buffers.
void spikevec_lock();
void spikevec_unlock();
}  // namespace coreneuron


================================================
FILE: coreneuron/io/phase1.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cassert>
#include <cstdio>
#include <mutex>

#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/phase1.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

/// Callback used by Phase1::Phase1(int) to obtain the phase-1 data of thread @p tid
/// directly (NEURON - CoreNEURON communication) instead of reading it from a file.
/// The caller treats a zero return value as "no data" and stops.
/// On success the caller copies n_presyn entries from output_gid and n_netcon entries
/// from netcon_srcgid and releases both arrays with delete[]; netcon_negsrcgid_tid
/// is filled in place.
int (*nrn2core_get_dat1_)(int tid,
                          int& n_presyn,
                          int& n_netcon,
                          int*& output_gid,
                          int*& netcon_srcgid,
                          std::vector<int>& netcon_negsrcgid_tid);

namespace coreneuron {
/// Build the phase-1 data from an open file handler: two counts followed by the
/// output gid array and the NetCon source gid array. The handler is closed afterwards.
Phase1::Phase1(FileHandler& F) {
    assert(!F.fail());

    // Both counts precede the arrays they describe and must be read in this order.
    const int presyn_count = F.read_int();  /// Number of PreSyn-s in NrnThread nt
    const int netcon_count = F.read_int();  /// Number of NetCon-s in NrnThread nt

    output_gids = F.read_vector<int>(presyn_count);
    netcon_srcgids = F.read_vector<int>(netcon_count);
    // In file transfer mode negative gids are not allowed to exist in different
    // threads, so netcon_negsrcgid_tid is left empty.

    F.close();
}

/// Build the phase-1 data for @p thread_id by asking the nrn2core_get_dat1_ callback
/// instead of reading a file. If the callback reports failure (returns 0) the object
/// is left with empty vectors.
Phase1::Phase1(int thread_id) {
    // Out-parameters of the callback. They are given defined values so that nothing
    // indeterminate is read should the callback leave one of them untouched.
    // The names deliberately differ from the members to avoid shadowing them.
    int* raw_output_gids = nullptr;
    int* raw_netcon_srcgids = nullptr;
    int n_presyn = 0;
    int n_netcon = 0;

    // TODO : check error codes for NEURON - CoreNEURON communication
    const int valid = (*nrn2core_get_dat1_)(thread_id,
                                            n_presyn,
                                            n_netcon,
                                            raw_output_gids,
                                            raw_netcon_srcgids,
                                            this->netcon_negsrcgid_tid);
    if (!valid) {
        return;
    }

    // Copy into the members, then release the arrays handed over by the callback.
    this->output_gids.assign(raw_output_gids, raw_output_gids + n_presyn);
    delete[] raw_output_gids;
    this->netcon_srcgids.assign(raw_netcon_srcgids, raw_netcon_srcgids + n_netcon);
    delete[] raw_netcon_srcgids;
}

/**
 * Transfer the phase-1 data into @p nt and register its output PreSyn objects.
 *
 * Sets nt.n_presyn / nt.n_netcon, stores a copy of the NetCon source gids and of
 * netcon_negsrcgid_tid in the per-thread tables, allocates nt.netcons and (when there
 * is at least one presyn) nt.presyns / nt.presyns_helper. Every output gid different
 * from -1 is then entered into gid2out (gid >= 0) or neg_gid2out[nt.id] (gid < 0).
 *
 * @param nt   thread being populated
 * @param mut  mutex held while the gid tables are inspected and modified
 */
void Phase1::populate(NrnThread& nt, OMP_Mutex& mut) {
    nt.n_presyn = this->output_gids.size();
    nt.n_netcon = this->netcon_srcgids.size();

    nrnthreads_netcon_srcgid[nt.id] = new int[nt.n_netcon];
    std::copy(this->netcon_srcgids.begin(),
              this->netcon_srcgids.end(),
              nrnthreads_netcon_srcgid[nt.id]);

    // netcon_negsrcgid_tid is empty if file transfer or single thread
    coreneuron::nrnthreads_netcon_negsrcgid_tid[nt.id] = this->netcon_negsrcgid_tid;

    nt.netcons = new NetCon[nt.n_netcon];

    if (nt.n_presyn) {
        nt.presyns_helper = (PreSynHelper*) ecalloc_align(nt.n_presyn, sizeof(PreSynHelper));
        nt.presyns = new PreSyn[nt.n_presyn];
    }

    // ps walks nt.presyns in step with output_gids, including the skipped -1 entries.
    PreSyn* ps = nt.presyns;
    /// go through all presyns
    for (const int gid: this->output_gids) {
        if (gid == -1) {
            ++ps;
            continue;
        }

        {
            const std::lock_guard<OMP_Mutex> lock(mut);
            // Note that the negative (type, index)
            // coded information goes into the neg_gid2out[tid] hash table.
            // See netpar.cpp for the netpar_tid_... function implementations.
            // Both that table and the process wide gid2out table can be deleted
            // before the end of setup

            /// Put gid into the gid2out hash table with correspondent output PreSyn
            /// Or to the negative PreSyn map
            if (gid >= 0) {
                char m[200];
                if (gid2in.find(gid) != gid2in.end()) {
                    // snprintf bounds the write to the buffer size.
                    snprintf(m, sizeof(m), "gid=%d already exists as an input port", gid);
                    hoc_execerror(m,
                                  "Setup all the output ports on this process before using them as "
                                  "input ports.");
                }
                if (gid2out.find(gid) != gid2out.end()) {
                    snprintf(m,
                             sizeof(m),
                             "gid=%d already exists on this process as an output port",
                             gid);
                    hoc_execerror(m, nullptr);
                }
                ps->gid_ = gid;
                ps->output_index_ = gid;
                gid2out[gid] = ps;
            } else {
                nrn_assert(neg_gid2out[nt.id].find(gid) == neg_gid2out[nt.id].end());
                ps->output_index_ = -1;
                neg_gid2out[nt.id][gid] = ps;
            }
        }  // end of the mutex

        ++ps;
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/phase1.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <vector>

#include "coreneuron/io/nrn_filehandler.hpp"
#include "coreneuron/utils/nrnmutdec.hpp"

namespace coreneuron {

struct NrnThread;

/// Holder for the phase-1 setup data of one thread: the output gids and the
/// NetCon source gids. The data comes either from a file or from the direct
/// transfer callback; populate() then installs it into an NrnThread.
class Phase1 {
  public:
    /// Read the data from @p F (two counts followed by the two gid arrays) and close it.
    Phase1(FileHandler& F);
    /// Obtain the data for @p thread_id through the direct transfer callback.
    Phase1(int thread_id);
    /// Copy the data into @p nt and register the output gids; @p mut is locked while
    /// the gid tables are accessed.
    void populate(NrnThread& nt, OMP_Mutex& mut);

  private:
    std::vector<int> output_gids;     // one entry per PreSyn; -1 entries are skipped by populate()
    std::vector<int> netcon_srcgids;  // one entry per NetCon
    std::vector<int> netcon_negsrcgid_tid;  // entries only for negative srcgids
};

}  // namespace coreneuron


================================================
FILE: coreneuron/io/phase2.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/io/phase2.hpp"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/nrn_checkpoint.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/permute/data_layout.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/utils/utils.hpp"
#include "coreneuron/utils/vrecitem.h"
#include "coreneuron/io/mem_layout_util.hpp"
#include "coreneuron/io/setup_fornetcon.hpp"

#if defined(_OPENMP)
#include <omp.h>
#endif

// Callbacks used by Phase2::read_direct to obtain the phase-2 data of a thread
// directly instead of reading it from a file. read_direct ignores the int return
// value of every one of them.

/// Provides the scalar sizes of thread @p tid plus two arrays of nmech entries
/// (mechanism types and per-mechanism node counts). read_direct copies both arrays
/// and releases them with delete[].
int (*nrn2core_get_dat2_1_)(int tid,
                            int& n_real_cell,
                            int& ngid,
                            int& n_real_gid,
                            int& nnode,
                            int& ndiam,
                            int& nmech,
                            int*& tml_index,
                            int*& ml_nodecount,
                            int& nidata,
                            int& nvdata,
                            int& nweight);

/// Provides the per-node arrays. read_direct passes pointers to storage it has
/// already allocated (diamvec is nullptr when there is no diam data) and does not
/// free anything afterwards.
int (*nrn2core_get_dat2_2_)(int tid,
                            int*& v_parent_index,
                            double*& a,
                            double*& b,
                            double*& area,
                            double*& v,
                            double*& diamvec);

/// Provides the data of the i-th mechanism of the thread. read_direct passes
/// nodeindices as nullptr and, if it comes back non-null, copies it and releases it
/// with free(); data and pdata point at storage owned by the caller.
int (*nrn2core_get_dat2_mech_)(int tid,
                               size_t i,
                               int dsz_inst,
                               int*& nodeindices,
                               double*& data,
                               int*& pdata,
                               std::vector<int>& pointer2type);

/// Provides the presyn / netcon arrays. read_direct copies each of the six arrays
/// and releases it with delete[].
int (*nrn2core_get_dat2_3_)(int tid,
                            int nweight,
                            int*& output_vindex,
                            double*& output_threshold,
                            int*& netcon_pnttype,
                            int*& netcon_pntindex,
                            double*& weights,
                            double*& delays);

/// read_direct stores the value returned through @p n in num_point_process.
int (*nrn2core_get_dat2_corepointer_)(int tid, int& n);

/// Provides the BBCOREPOINTER int / double arrays (icnt / dcnt entries) of one
/// mechanism type. read_direct copies both arrays and releases them with delete[].
int (*nrn2core_get_dat2_corepointer_mech_)(int tid,
                                           int type,
                                           int& icnt,
                                           int& dcnt,
                                           int*& iarray,
                                           double*& darray);

/// Fills @p indices with the VecPlayContinuous indices of the thread
/// (indices into NEURON's NetCvode::fixed_play_).
int (*nrn2core_get_dat2_vecplay_)(int tid, std::vector<int>& indices);

/// Provides one VecPlayContinuous instance. yvec / tvec hold sz entries each and
/// remain owned by NEURON: read_direct copies them and does not free them.
int (*nrn2core_get_dat2_vecplay_inst_)(int tid,
                                       int i,
                                       int& vptype,
                                       int& mtype,
                                       int& ix,
                                       int& sz,
                                       double*& yvec,
                                       double*& tvec,
                                       int& last_index,
                                       int& discon_index,
                                       int& ubound_index);

namespace coreneuron {
/**
 * Reorder, in place, @p cnt instances of @p sz values each from instance-major
 * order (data[i * sz + j]) to field-major order with a padded instance stride
 * (data[i + j * padded]).
 *
 * Nothing is done when @p layout equals Layout::AoS. The padded stride is obtained
 * from nrn_soa_padded_size(cnt, layout); @p data must be large enough for it.
 */
template <typename T>
inline void mech_data_layout_transform(T* data, int cnt, int sz, int layout) {
    if (layout != Layout::AoS) {
        const int padded_cnt = nrn_soa_padded_size(cnt, layout);

        // Snapshot of the original values: the rewrite below overwrites entries
        // that have not been relocated yet.
        const int total = cnt * sz;
        std::vector<T> snapshot(total);
        for (int k = 0; k < total; ++k) {
            snapshot[k] = data[k];
        }

        for (int field = 0; field < sz; ++field) {
            for (int inst = 0; inst < cnt; ++inst) {
                data[inst + field * padded_cnt] = snapshot[inst * sz + field];
            }
        }
    }
}

/**
 * Read the phase-2 data of one thread from @p F into this object.
 *
 * The reads below mirror the layout of the stream, so their order must not change.
 * @p nt is only used for its n_presyn / n_netcon counts, which size several arrays.
 * After the VecPlayContinuous section a checkpoint is recorded; if the stream is at
 * its end the function returns, otherwise additional restore data (vecplay indices,
 * patstim index, presyn flags and two event lists) is read.
 */
void Phase2::read_file(FileHandler& F, const NrnThread& nt) {
    // Scalar sizes.
    n_real_cell = F.read_int();
    n_output = F.read_int();
    n_real_output = F.read_int();
    n_node = F.read_int();
    n_diam = F.read_int();
    n_mech = F.read_int();
    // One (type, nodecount) pair per mechanism.
    mech_types = std::vector<int>(n_mech, 0);
    nodecounts = std::vector<int>(n_mech, 0);
    for (int i = 0; i < n_mech; ++i) {
        mech_types[i] = F.read_int();
        nodecounts[i] = F.read_int();
    }

    // check mechanism compatibility before reading data
    check_mechanism();

    n_idata = F.read_int();
    n_vdata = F.read_int();
    int n_weight = F.read_int();
    v_parent_index = (int*) ecalloc_align(n_node, sizeof(int));
    F.read_array<int>(v_parent_index, n_node);

    // _data starts with padded per-node blocks (6, or 7 when diam is present),
    // followed by one aligned, padded block per mechanism.
    int n_data_padded = nrn_soa_padded_size(n_node, SOA_LAYOUT);
    {
        {  // Compute size of _data and allocate
            int n_data = 6 * n_data_padded;
            if (n_diam > 0) {
                n_data += n_data_padded;
            }
            for (int i = 0; i < n_mech; ++i) {
                int layout = corenrn.get_mech_data_layout()[mech_types[i]];
                int n = nodecounts[i];
                int sz = corenrn.get_prop_param_size()[mech_types[i]];
                n_data = nrn_soa_byte_align(n_data);
                n_data += nrn_soa_padded_size(n, layout) * sz;
            }
            _data = (double*) ecalloc_align(n_data, sizeof(double));
        }
        // Stream order fills blocks 2, 3, 5, 4 (then 6 for diam); read_direct names
        // these blocks a, b, area, v and diam respectively.
        F.read_array<double>(_data + 2 * n_data_padded, n_node);
        F.read_array<double>(_data + 3 * n_data_padded, n_node);
        F.read_array<double>(_data + 5 * n_data_padded, n_node);
        F.read_array<double>(_data + 4 * n_data_padded, n_node);
        if (n_diam > 0) {
            F.read_array<double>(_data + 6 * n_data_padded, n_node);
        }
    }

    // Per-mechanism data; offset follows the same computation used for the allocation.
    size_t offset = 6 * n_data_padded;
    if (n_diam > 0) {
        offset += n_data_padded;
    }
    for (int i = 0; i < n_mech; ++i) {
        int layout = corenrn.get_mech_data_layout()[mech_types[i]];
        int n = nodecounts[i];
        int sz = corenrn.get_prop_param_size()[mech_types[i]];
        int dsz = corenrn.get_prop_dparam_size()[mech_types[i]];
        offset = nrn_soa_byte_align(offset);
        // nodeindices are present in the stream only for non-artificial mechanisms
        std::vector<int> nodeindices;
        if (!corenrn.get_is_artificial()[mech_types[i]]) {
            nodeindices = F.read_vector<int>(n);
        }
        // sz * n values are read contiguously, while offset advances by the padded size
        F.read_array<double>(_data + offset, sz * n);
        offset += nrn_soa_padded_size(n, layout) * sz;
        std::vector<int> pdata;
        if (dsz > 0) {
            pdata = F.read_vector<int>(dsz * n);
        }
        tmls.emplace_back(TML{nodeindices, pdata, mech_types[i], {}, {}});
        if (dsz > 0) {
            // Note: this sz (number of pointer2type entries) shadows the parameter
            // size sz declared above.
            int sz = F.read_int();
            if (sz) {
                auto& p2t = tmls.back().pointer2type;
                p2t = F.read_vector<int>(sz);
            }
        }
    }
    // Presyn / netcon arrays, sized by counts already stored in nt.
    output_vindex = F.read_vector<int>(nt.n_presyn);
    output_threshold = F.read_vector<double>(n_real_output);
    pnttype = F.read_vector<int>(nt.n_netcon);
    pntindex = F.read_vector<int>(nt.n_netcon);
    weights = F.read_vector<double>(n_weight);
    delay = F.read_vector<double>(nt.n_netcon);
    num_point_process = F.read_int();

    // Extra int / double arrays, present only for mechanisms with a bbcore_read entry.
    for (int i = 0; i < n_mech; ++i) {
        if (!corenrn.get_bbcore_read()[mech_types[i]]) {
            continue;
        }
        tmls[i].type = F.read_int();
        int icnt = F.read_int();
        int dcnt = F.read_int();
        if (icnt > 0) {
            tmls[i].iArray = F.read_vector<int>(icnt);
        }
        if (dcnt > 0) {
            tmls[i].dArray = F.read_vector<double>(dcnt);
        }
    }

    // VecPlayContinuous instances: three ints, a size, then yvec and tvec of that size.
    int n_vec_play_continuous = F.read_int();
    vec_play_continuous.reserve(n_vec_play_continuous);
    for (int i = 0; i < n_vec_play_continuous; ++i) {
        VecPlayContinuous_ item;
        item.vtype = F.read_int();
        item.mtype = F.read_int();
        item.ix = F.read_int();
        int sz = F.read_int();
        item.yvec = IvocVect(sz);
        item.tvec = IvocVect(sz);
        F.read_array<double>(item.yvec.data(), sz);
        F.read_array<double>(item.tvec.data(), sz);
        vec_play_continuous.push_back(std::move(item));
    }

    // store current checkpoint state to continue reading mapping
    // The checkpoint numbering in phase 3 is a continuing of phase 2, and so will be restored
    F.record_checkpoint();

    // Nothing further in the stream: no restore data to read.
    if (F.eof())
        return;

    // NOTE(review): from here on several reads happen inside nrn_assert(...); this
    // relies on nrn_assert always evaluating its argument -- confirm.
    nrn_assert(F.read_int() == n_vec_play_continuous);

    for (int i = 0; i < n_vec_play_continuous; ++i) {
        auto& vecPlay = vec_play_continuous[i];
        vecPlay.last_index = F.read_int();
        vecPlay.discon_index = F.read_int();
        vecPlay.ubound_index = F.read_int();
    }

    patstim_index = F.read_int();

    // Each of the following sections is preceded by a -1 marker.
    nrn_assert(F.read_int() == -1);

    for (int i = 0; i < nt.n_presyn; ++i) {
        preSynConditionEventFlags.push_back(F.read_int());
    }

    nrn_assert(F.read_int() == -1);
    restore_events(F);

    nrn_assert(F.read_int() == -1);
    restore_events(F);
}

/**
 * Fill this object with the phase-2 data of @p thread_id through the
 * nrn2core_get_dat2_* callbacks instead of reading a file.
 *
 * The return values of the callbacks are not checked. Arrays handed over by the
 * callbacks are copied into members and released here (delete[], or free() for the
 * node indices), except the vecplay vectors which stay owned by NEURON.
 * @p nt supplies n_presyn, n_netcon and the thread id used for some callbacks.
 */
void Phase2::read_direct(int thread_id, const NrnThread& nt) {
    int* types_ = nullptr;
    int* nodecounts_ = nullptr;
    int n_weight;
    // Scalar sizes plus the mechanism type / nodecount arrays.
    (*nrn2core_get_dat2_1_)(thread_id,
                            n_real_cell,
                            n_output,
                            n_real_output,
                            n_node,
                            n_diam,
                            n_mech,
                            types_,
                            nodecounts_,
                            n_idata,
                            n_vdata,
                            n_weight);
    mech_types = std::vector<int>(types_, types_ + n_mech);
    delete[] types_;

    nodecounts = std::vector<int>(nodecounts_, nodecounts_ + n_mech);
    delete[] nodecounts_;

    check_mechanism();

    // TODO: fix it in the future
    // Size of _data: 6 padded per-node blocks (7 with diam) followed by one aligned,
    // padded block per mechanism.
    int n_data_padded = nrn_soa_padded_size(n_node, SOA_LAYOUT);
    int n_data = 6 * n_data_padded;
    if (n_diam > 0) {
        n_data += n_data_padded;
    }
    for (int i = 0; i < n_mech; ++i) {
        int layout = corenrn.get_mech_data_layout()[mech_types[i]];
        int n = nodecounts[i];
        int sz = corenrn.get_prop_param_size()[mech_types[i]];
        n_data = nrn_soa_byte_align(n_data);
        n_data += nrn_soa_padded_size(n, layout) * sz;
    }
    _data = (double*) ecalloc_align(n_data, sizeof(double));

    // The per-node arrays live inside _data; the callback receives pointers to them.
    v_parent_index = (int*) ecalloc_align(n_node, sizeof(int));
    double* actual_a = _data + 2 * n_data_padded;
    double* actual_b = _data + 3 * n_data_padded;
    double* actual_v = _data + 4 * n_data_padded;
    double* actual_area = _data + 5 * n_data_padded;
    double* actual_diam = n_diam > 0 ? _data + 6 * n_data_padded : nullptr;
    (*nrn2core_get_dat2_2_)(
        thread_id, v_parent_index, actual_a, actual_b, actual_area, actual_v, actual_diam);

    tmls.resize(n_mech);

    auto& param_sizes = corenrn.get_prop_param_size();
    auto& dparam_sizes = corenrn.get_prop_dparam_size();
    // dsz_inst counts the mechanisms seen so far that have a non-zero dparam size
    int dsz_inst = 0;
    size_t offset = 6 * n_data_padded;
    if (n_diam > 0)
        offset += n_data_padded;
    for (int i = 0; i < n_mech; ++i) {
        auto& tml = tmls[i];
        int type = mech_types[i];
        int layout = corenrn.get_mech_data_layout()[type];
        offset = nrn_soa_byte_align(offset);

        tml.type = type;
        // artificial cell don't use nodeindices
        if (!corenrn.get_is_artificial()[type]) {
            tml.nodeindices.resize(nodecounts[i]);
        }
        tml.pdata.resize(nodecounts[i] * dparam_sizes[type]);

        // data_ and pdata_ point at storage owned here; nodeindices_ starts as
        // nullptr and is only handled below if the callback sets it.
        int* nodeindices_ = nullptr;
        double* data_ = _data + offset;
        int* pdata_ = const_cast<int*>(tml.pdata.data());
        (*nrn2core_get_dat2_mech_)(thread_id,
                                   i,
                                   dparam_sizes[type] > 0 ? dsz_inst : 0,
                                   nodeindices_,
                                   data_,
                                   pdata_,
                                   tml.pointer2type);
        if (dparam_sizes[type] > 0)
            dsz_inst++;
        offset += nrn_soa_padded_size(nodecounts[i], layout) * param_sizes[type];
        if (nodeindices_) {
            std::copy(nodeindices_, nodeindices_ + nodecounts[i], tml.nodeindices.data());
            free(nodeindices_);  // not free_memory because this is allocated by NEURON?
        }
        if (corenrn.get_is_artificial()[type]) {
            assert(nodeindices_ == nullptr);
        }
    }

    // Presyn / netcon arrays: each is copied into a member vector and then deleted.
    int* output_vindex_ = nullptr;
    double* output_threshold_ = nullptr;
    int* pnttype_ = nullptr;
    int* pntindex_ = nullptr;
    double* weight_ = nullptr;
    double* delay_ = nullptr;
    (*nrn2core_get_dat2_3_)(thread_id,
                            n_weight,
                            output_vindex_,
                            output_threshold_,
                            pnttype_,
                            pntindex_,
                            weight_,
                            delay_);

    output_vindex = std::vector<int>(output_vindex_, output_vindex_ + nt.n_presyn);
    delete[] output_vindex_;

    output_threshold = std::vector<double>(output_threshold_, output_threshold_ + n_real_output);
    delete[] output_threshold_;

    int n_netcon = nt.n_netcon;
    pnttype = std::vector<int>(pnttype_, pnttype_ + n_netcon);
    delete[] pnttype_;

    pntindex = std::vector<int>(pntindex_, pntindex_ + n_netcon);
    delete[] pntindex_;

    weights = std::vector<double>(weight_, weight_ + n_weight);
    delete[] weight_;

    delay = std::vector<double>(delay_, delay_ + n_netcon);
    delete[] delay_;

    // Note: this callback and the next one are given nt.id rather than thread_id.
    (*nrn2core_get_dat2_corepointer_)(nt.id, num_point_process);

    for (int i = 0; i < n_mech; ++i) {
        // not all mod files have BBCOREPOINTER data to read
        if (!corenrn.get_bbcore_read()[mech_types[i]]) {
            continue;
        }
        int icnt;
        int* iArray_ = nullptr;
        int dcnt;
        double* dArray_ = nullptr;
        (*nrn2core_get_dat2_corepointer_mech_)(nt.id, tmls[i].type, icnt, dcnt, iArray_, dArray_);
        tmls[i].iArray.resize(icnt);
        std::copy(iArray_, iArray_ + icnt, tmls[i].iArray.begin());
        delete[] iArray_;

        tmls[i].dArray.resize(dcnt);
        std::copy(dArray_, dArray_ + dcnt, tmls[i].dArray.begin());
        delete[] dArray_;
    }

    // Get from NEURON, the VecPlayContinuous indices in
    // NetCvode::fixed_play_ for this thread.
    std::vector<int> indices_vec_play_continuous;
    (*nrn2core_get_dat2_vecplay_)(thread_id, indices_vec_play_continuous);

    // i is an index into NEURON's NetCvode::fixed_play_ for this thread.
    for (auto i: indices_vec_play_continuous) {
        VecPlayContinuous_ item;
        // yvec_ and tvec_ are not deleted as that space is within
        // NEURON Vector
        double *yvec_, *tvec_;
        int sz;
        (*nrn2core_get_dat2_vecplay_inst_)(thread_id,
                                           i,
                                           item.vtype,
                                           item.mtype,
                                           item.ix,
                                           sz,
                                           yvec_,
                                           tvec_,
                                           item.last_index,
                                           item.discon_index,
                                           item.ubound_index);
        // Copy the sz values of each vector into storage owned by item.
        item.yvec = IvocVect(sz);
        item.tvec = IvocVect(sz);
        std::copy(yvec_, yvec_ + sz, item.yvec.data());
        std::copy(tvec_, tvec_ + sz, item.tvec.data());
        vec_play_continuous.push_back(std::move(item));
    }
}

/// Verify that none of this thread's mechanism types is listed as "different" from
/// the MOD file used by NEURON. Each offending mechanism is reported on rank 0; if
/// any was found, rank 0 prints a summary and calls nrn_abort(1).
void Phase2::check_mechanism() {
    const bool is_rank0 = (nrnmpi_myid == 0);
    int mismatch_count = 0;

    for (int i = 0; i < n_mech; ++i) {
        const int mtype = mech_types[i];

        // Is this type among the ones flagged as different?
        bool flagged = false;
        for (int different_type: corenrn.get_different_mechanism_type()) {
            if (different_type == mtype) {
                flagged = true;
                break;
            }
        }
        if (!flagged) {
            continue;
        }

        if (is_rank0) {
            printf("Error: %s is a different MOD file than used by NEURON!\n",
                   nrn_get_mechname(mtype));
        }
        ++mismatch_count;
    }

    if (mismatch_count > 0 && is_rank0) {
        printf(
            "Error : NEURON and CoreNEURON must use same mod files for compatibility, %d "
            "different mod file(s) found. Re-compile special and special-core!\n",
            mismatch_count);
        nrn_abort(1);
    }
}

/// Rebase one pdata column in place: for each of the @p nodecount instances, the
/// entry of column @p i (located through nrn_i_layout) is checked to lie in
/// [0, n_node_) and then shifted by @p elem0 so that it becomes relative to nt._data.
void Phase2::transform_int_data(int elem0,
                                int nodecount,
                                int* pdata,
                                int i,
                                int dparam_size,
                                int layout,
                                int n_node_) {
    for (int inst = 0; inst < nodecount; ++inst) {
        int& slot = pdata[nrn_i_layout(inst, nodecount, i, dparam_size, layout)];
        const int ix = slot;  // relative to beginning of _actual_*
        nrn_assert((ix >= 0) && (ix < n_node_));
        slot = elem0 + ix;  // relative to nt._data
    }
}

/**
 * Attach receive / send buffers to the mechanism lists of this thread.
 *
 * For every type registered in corenrn.get_net_buf_receive() that has a Memb_list in
 * @p ml_list, a zero-initialised NetReceiveBuffer_t is allocated with room for
 * max(8, nodecount) entries. For every type in corenrn.get_net_buf_send_type() that
 * has a Memb_list, a NetSendBuffer_t sized at twice the nodecount is created.
 *
 * @param ml_list     Memb_list pointers indexed by mechanism type (may hold nullptr)
 * @param pnt_offset  per-type offset stored in the receive buffer
 */
void Phase2::set_net_send_buffer(Memb_list** ml_list, const std::vector<int>& pnt_offset) {
    // NetReceiveBuffering
    for (auto& receive_entry: corenrn.get_net_buf_receive()) {
        const int mech_type = receive_entry.second;
        Memb_list* ml = ml_list[mech_type];
        if (!ml) {
            continue;  // this thread does not have the type
        }

        auto* nrb =
            static_cast<NetReceiveBuffer_t*>(ecalloc_align(1, sizeof(NetReceiveBuffer_t)));
        assert(!ml->_net_receive_buffer);
        ml->_net_receive_buffer = nrb;
        nrb->_pnt_offset = pnt_offset[mech_type];

        // initial capacity: one slot per instance, but never fewer than 8
        nrb->_size = std::max(8, ml->nodecount);
        const auto capacity = nrb->_size;
        nrb->_pnt_index = static_cast<int*>(ecalloc_align(capacity, sizeof(int)));
        nrb->_displ = static_cast<int*>(ecalloc_align(capacity + 1, sizeof(int)));
        nrb->_nrb_index = static_cast<int*>(ecalloc_align(capacity, sizeof(int)));
        nrb->_weight_index = static_cast<int*>(ecalloc_align(capacity, sizeof(int)));
        nrb->_nrb_t = static_cast<double*>(ecalloc_align(capacity, sizeof(double)));
        nrb->_nrb_flag = static_cast<double*>(ecalloc_align(capacity, sizeof(double)));
    }

    // NetSendBuffering
    for (int mech_type: corenrn.get_net_buf_send_type()) {
        Memb_list* ml = ml_list[mech_type];
        if (!ml) {
            continue;  // this thread does not have the type
        }
        assert(!ml->_net_send_buffer);
        // initial capacity: twice the number of instances
        ml->_net_send_buffer = new NetSendBuffer_t(ml->nodecount * 2);
    }
}

/**
 * Read a list of saved events from @p F and append them to `events`.
 *
 * Each record starts with an int type tag followed by the event time; a tag of 0
 * terminates the list. The remaining fields depend on the tag. Unknown tags, and
 * play-record events other than VecPlayContinuousType, trigger nrn_assert(0).
 */
void Phase2::restore_events(FileHandler& F) {
    for (int type = F.read_int(); type != 0; type = F.read_int()) {
        double time;
        F.read_array(&time, 1);

        switch (type) {
            case NetConType: {
                auto ev = std::make_shared<NetConType_>();
                ev->time = time;
                ev->netcon_index = F.read_int();
                events.emplace_back(type, ev);
                break;
            }
            case SelfEventType: {
                auto ev = std::make_shared<SelfEventType_>();
                ev->time = time;
                // field order below is the order in the stream
                ev->target_type = F.read_int();
                ev->point_proc_instance = F.read_int();
                ev->target_instance = F.read_int();
                F.read_array(&ev->flag, 1);
                ev->movable = F.read_int();
                ev->weight_index = F.read_int();
                events.emplace_back(type, ev);
                break;
            }
            case PreSynType: {
                auto ev = std::make_shared<PreSynType_>();
                ev->time = time;
                ev->presyn_index = F.read_int();
                events.emplace_back(type, ev);
                break;
            }
            case NetParEventType: {
                // carries nothing besides its time
                auto ev = std::make_shared<NetParEvent_>();
                ev->time = time;
                events.emplace_back(type, ev);
                break;
            }
            case PlayRecordEventType: {
                auto ev = std::make_shared<PlayRecordEventType_>();
                ev->time = time;
                ev->play_record_type = F.read_int();
                if (ev->play_record_type != VecPlayContinuousType) {
                    nrn_assert(0);
                } else {
                    ev->vecplay_index = F.read_int();
                    events.emplace_back(type, ev);
                }
                break;
            }
            default: {
                nrn_assert(0);
                break;
            }
        }
    }
}

/**
 * Build the before/after lists nt.tbl[0 .. BEFORE_AFTER_SIZE).
 *
 * For each list index, the thread's mechanism list (nt.tml) is walked in order and,
 * for every mechanism type that has BAMech entries, one NrnThreadBAList node is
 * appended per consecutive BAMech of that type, starting at the first one found in
 * corenrn.get_bamech()[index].
 *
 * @param nt         thread whose tbl lists are filled
 * @param memb_func  only its size is used, to size the per-type lookup table
 */
void Phase2::fill_before_after_lists(NrnThread& nt, const std::vector<Memb_func>& memb_func) {
    // first BAMech seen for each mechanism type, rebuilt for every list index
    std::vector<BAMech*> first_bam_of_type(memb_func.size());

    for (int ba = 0; ba < BEFORE_AFTER_SIZE; ++ba) {
        first_bam_of_type.assign(memb_func.size(), nullptr);

        // Remember only the first block per type; further blocks of the same type
        // are reached through the linked list when the nodes are created below.
        for (auto bam = corenrn.get_bamech()[ba]; bam; bam = bam->next) {
            auto& first = first_bam_of_type[bam->type];
            if (!first) {
                first = bam;
            }
        }

        // Appending at the tail keeps the order of multiple BAMech of one type.
        NrnThreadBAList** tail = nt.tbl + ba;
        for (auto tml = nt.tml; tml; tml = tml->next) {
            const int mtype = tml->index;
            for (auto bam = first_bam_of_type[mtype]; bam && bam->type == mtype;
                 bam = bam->next) {
                auto node = (NrnThreadBAList*) emalloc(sizeof(NrnThreadBAList));
                node->next = nullptr;
                node->bam = bam;
                node->ml = tml->ml;
                *tail = node;
                tail = &(node->next);
            }
        }
    }
}

/**
 * Rewrite the pdata entries of every mechanism in nt.tml so that they become
 * indices relative to nt._data.
 *
 * Entries are handled according to the dparam semantics of their column:
 * area (-1) and diam (-9) are shifted by the offset of the corresponding array,
 * ion variables (0..999) are mapped through nrn_param_layout and shifted by the
 * offset of the ion's data, and POINTER entries (-5) are resolved one by one in a
 * second pass using the types recorded in pointer2type.
 *
 * @param nt         thread whose mechanism lists are updated
 * @param memb_func  per-type table supplying dparam_semantics
 */
void Phase2::pdata_relocation(const NrnThread& nt, const std::vector<Memb_func>& memb_func) {
    // Some pdata may index into data which has been reordered from AoS to
    // SoA. The four possibilities are if semantics is -1 (area), -5 (pointer),
    // -9 (diam), or 0-999 (ion variables).
    // Note that pdata has a layout, and the type block in nt.data into which
    // it indexes has a layout.

    // For faster search of tmls[i].type == type, use a map.
    // (Perhaps it would be better to replace tmls so that we can use tmls[type].)
    // Only mechanisms with a non-empty pointer2type are entered.
    std::map<int, size_t> type2itml;
    for (size_t i = 0; i < tmls.size(); ++i) {
        if (tmls[i].pointer2type.size()) {
            type2itml[tmls[i].type] = i;
        }
    }

    for (auto tml = nt.tml; tml; tml = tml->next) {
        int type = tml->index;
        int layout = corenrn.get_mech_data_layout()[type];
        int* pdata = tml->ml->pdata;
        int cnt = tml->ml->nodecount;
        int szdp = corenrn.get_prop_dparam_size()[type];
        int* semantics = memb_func[type].dparam_semantics;

        // NOTE(review): the original comment here read "compute only for
        // ARTIFICIAL_CELL (has useful area pointer with semantics=-1)", but the
        // condition below selects mechanisms that are NOT artificial -- confirm intent.
        if (!corenrn.get_is_artificial()[type]) {
            if (szdp) {
                if (!semantics)
                    continue;  // temporary for HDFReport, Binreport which will be skipped in
                // bbcore_write of HBPNeuron
                nrn_assert(semantics);
            }

            for (int i = 0; i < szdp; ++i) {
                int s = semantics[i];
                switch (s) {
                    case -1:  // area
                        transform_int_data(
                            nt._actual_area - nt._data, cnt, pdata, i, szdp, layout, nt.end);
                        break;
                    case -9:  // diam
                        transform_int_data(
                            nt._actual_diam - nt._data, cnt, pdata, i, szdp, layout, nt.end);
                        break;
                    case -5:  // pointer assumes a pointer to membrane voltage
                        // or mechanism data in this thread. The value of the
                        // pointer on the NEURON side was analyzed by
                        // nrn_dblpntr2nrncore which returned the
                        // mechanism index and type. At this moment the index
                        // is in pdata and the type is in the pointer2type of the
                        // matching tmls entry.
                        // However the latter order is according to the nested
                        // iteration for nodecount { for szdp {}}
                        // Also the nodecount POINTER instances of mechanism
                        // might possibly point to different range variables.
                        // Therefore it is not possible to use transform_int_data
                        // and the transform must be done one at a time.
                        // So we do nothing here and separately iterate
                        // after this loop instead of the former voltage only
                        /**
                        transform_int_data(
                            nt._actual_v - nt._data, cnt, pdata, i, szdp, layout, nt.end);
                         **/
                        break;
                    default:
                        if (s >= 0 && s < 1000) {  // ion
                            int etype = s;
                            /* if ion is SoA, must recalculate pdata values */
                            /* if ion is AoS, have to deal with offset */
                            Memb_list* eml = nt._ml_list[etype];
                            int edata0 = eml->data - nt._data;
                            int ecnt = eml->nodecount;
                            int esz = corenrn.get_prop_param_size()[etype];
                            for (int iml = 0; iml < cnt; ++iml) {
                                int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout);
                                int ix = *pd;  // relative to the ion data
                                nrn_assert((ix >= 0) && (ix < ecnt * esz));
                                /* Original pd order assumed ecnt groups of esz */
                                *pd = edata0 + nrn_param_layout(ix, etype, eml);
                            }
                        }
                }
            }
            // Handle case -5 POINTER transformation (see comment above)
            auto search = type2itml.find(type);
            if (search != type2itml.end()) {
                auto& ptypes = tmls[type2itml[type]].pointer2type;
                assert(ptypes.size());
                // iptype advances once per POINTER entry, in the same
                // instance-major / column-minor order as the loops below.
                size_t iptype = 0;
                for (int iml = 0; iml < cnt; ++iml) {
                    for (int i = 0; i < szdp; ++i) {
                        if (semantics[i] == -5) {  // POINTER
                            int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout);
                            int ix = *pd;  // relative to elem0
                            int ptype = ptypes[iptype++];
                            if (ptype == voltage) {
                                // points into the voltage array
                                nrn_assert((ix >= 0) && (ix < nt.end));
                                int elem0 = nt._actual_v - nt._data;
                                *pd = elem0 + ix;
                            } else {
                                // points into the data of mechanism type ptype
                                Memb_list* pml = nt._ml_list[ptype];
                                int pcnt = pml->nodecount;
                                int psz = corenrn.get_prop_param_size()[ptype];
                                nrn_assert((ix >= 0) && (ix < pcnt * psz));
                                int elem0 = pml->data - nt._data;
                                *pd = elem0 + nrn_param_layout(ix, ptype, pml);
                            }
                        }
                    }
                }
                // the recorded types are no longer needed once applied
                ptypes.clear();
            }
        }
    }
}

void Phase2::set_dependencies(const NrnThread& nt, const std::vector<Memb_func>& memb_func) {
    /* here we setup the mechanism dependencies. if there is a mechanism dependency
     * then we allocate an array for tml->dependencies otherwise set it to nullptr.
     * In order to find out the "real" dependencies i.e. dependent mechanism
     * exist at the same compartment, we compare the nodeindices of mechanisms
     * returned by nrn_mech_depend.
     */

    /* temporary array for dependencies */
    int* mech_deps = (int*) ecalloc(memb_func.size(), sizeof(int));

    for (auto tml = nt.tml; tml; tml = tml->next) {
        /* initialize to null */
        tml->dependencies = nullptr;
        tml->ndependencies = 0;

        /* get dependencies from the models */
        int deps_cnt = nrn_mech_depend(tml->index, mech_deps);

        /* if dependencies, setup dependency array */
        if (deps_cnt) {
            /* store "real" dependencies in the vector */
            std::vector<int> actual_mech_deps;

            Memb_list* ml = tml->ml;
            int* nodeindices = ml->nodeindices;

            /* iterate over dependencies */
            for (int j = 0; j < deps_cnt; j++) {
                /* memb_list of dependency mechanism */
                Memb_list* dml = nt._ml_list[mech_deps[j]];

                /* dependency mechanism may not exist in the model */
                if (!dml)
                    continue;

                /* take nodeindices for comparison */
                int* dnodeindices = dml->nodeindices;

                /* set_intersection function needs temp vector to push the common values */
                std::vector<int> node_intersection;

                /* make sure they have non-zero nodes and find their intersection */
                if ((ml->nodecount > 0) && (dml->nodecount > 0)) {
                    std::set_intersection(nodeindices,
                                          nodeindices + ml->nodecount,
                                          dnodeindices,
                                          dnodeindices + dml->nodecount,
                                          std::back_inserter(node_intersection));
                }

                /* if they intersect in the nodeindices, it's real dependency */
                if (!node_intersection.empty()) {
                    actual_mech_deps.push_back(mech_deps[j]);
                }
            }

            /* copy actual_mech_deps to dependencies */
            if (!actual_mech_deps.empty()) {
                tml->ndependencies = actual_mech_deps.size();
                tml->dependencies = (int*) ecalloc(actual_mech_deps.size(), sizeof(int));
                std::copy(actual_mech_deps.begin(), actual_mech_deps.end(), tml->dependencies);
            }
        }
    }

    /* free temp dependency array */
    free(mech_deps);
}

void Phase2::handle_weights(NrnThread& nt, int n_netcon, NrnThreadChkpnt& ntc) {
    // Copy the weights into a thread-owned aligned array. They are stored in
    // NetCon order, each NetCon owning a contiguous group of weights.
    nt.n_weight = weights.size();
    nt.weights = (double*) ecalloc_align(nt.n_weight, sizeof(double));
    std::copy(weights.begin(), weights.end(), nt.weights);

    // Give every NetCon the offset of its first weight. The group size is the
    // receive size of the target type, or a single weight when the type is 0.
    int iw = 0;
    for (int inc = 0; inc < n_netcon; ++inc) {
        nt.netcons[inc].u.weight_index_ = iw;
        const int target_type = pnttype[inc];
        iw += (target_type != 0) ? corenrn.get_pnt_receive_size()[target_type] : 1;
    }
    // every weight must have been attributed to exactly one NetCon
    assert(iw == nt.n_weight);

    // Nontrivial if FOR_NETCON in use by some mechanisms
    setup_fornetcon_info(nt);

#if CHKPNTDEBUG
    // keep an untouched copy of the delays for checkpoint debugging
    ntc.delay = new double[n_netcon];
    memcpy(ntc.delay, delay.data(), n_netcon * sizeof(double));
#endif

    // finally transfer the per-NetCon delays
    for (int inc = 0; inc < n_netcon; ++inc) {
        nt.netcons[inc].delay_ = delay[inc];
    }
}

void Phase2::get_info_from_bbcore(NrnThread& nt,
                                  const std::vector<Memb_func>& memb_func,
                                  NrnThreadChkpnt& ntc) {
    // BBCOREPOINTER information: for every mechanism that registered a
    // bbcore_read callback, call it once per instance so it can consume the
    // staged dArray/iArray buffers of that mechanism.
#if CHKPNTDEBUG
    ntc.nbcp = num_point_process;
    ntc.bcpicnt = new int[n_mech];
    ntc.bcpdcnt = new int[n_mech];
    ntc.bcptype = new int[n_mech];
    size_t point_proc_id = 0;
#endif
    for (int i = 0; i < n_mech; ++i) {
        // mechanisms without a bbcore_read callback have nothing to restore
        if (!corenrn.get_bbcore_read()[mech_types[i]]) {
            continue;
        }
        auto& staged = tmls[i];
        // From here on the type recorded with the staged data is used.
        // (Original note: this is not an error, but it has to be fixed I think)
        const int type = staged.type;
#if CHKPNTDEBUG
        ntc.bcptype[point_proc_id] = type;
        ntc.bcpicnt[point_proc_id] = staged.iArray.size();
        ntc.bcpdcnt[point_proc_id] = staged.dArray.size();
        ++point_proc_id;
#endif
        Memb_list* ml = nt._ml_list[type];
        const int dsz = corenrn.get_prop_param_size()[type];
        const int pdsz = corenrn.get_prop_dparam_size()[type];
        const int cntml = ml->nodecount;
        const int layout = corenrn.get_mech_data_layout()[type];

        // running read positions in iArray / dArray, advanced by the callback
        int ik = 0;
        int dk = 0;
        for (int j = 0; j < cntml; ++j) {
            // instance j may have been moved by the mechanism permutation
            const int jp = ml->_permute ? ml->_permute[j] : j;
            // first parameter / first dparam of that instance
            double* d = ml->data + nrn_i_layout(jp, cntml, 0, dsz, layout);
            Datum* pd = ml->pdata + nrn_i_layout(jp, cntml, 0, pdsz, layout);
            int aln_cntml = nrn_soa_padded_size(cntml, layout);
            (*corenrn.get_bbcore_read()[type])(staged.dArray.data(),
                                               staged.iArray.data(),
                                               &dk,
                                               &ik,
                                               0,
                                               aln_cntml,
                                               d,
                                               pd,
                                               ml->_thread,
                                               &nt,
                                               ml,
                                               0.0);
        }
        // the callbacks must have consumed both staged buffers completely
        assert(dk == static_cast<int>(tmls[i].dArray.size()));
        assert(ik == static_cast<int>(tmls[i].iArray.size()));
    }
}

void Phase2::set_vec_play(NrnThread& nt, NrnThreadChkpnt& ntc) {
    // Create one VecPlayContinuous per staged entry and store it in
    // nt._vecplay. No attempt at memory efficiency.
    nt.n_vecplay = vec_play_continuous.size();
    nt._vecplay = nt.n_vecplay ? new void*[nt.n_vecplay] : nullptr;
#if CHKPNTDEBUG
    ntc.vecplay_ix = new int[nt.n_vecplay];
    ntc.vtype = new int[nt.n_vecplay];
    ntc.mtype = new int[nt.n_vecplay];
#endif
    for (int iplay = 0; iplay < nt.n_vecplay; ++iplay) {
        auto& vecPlay = vec_play_continuous[iplay];
        nrn_assert(vecPlay.vtype == VecPlayContinuousType);
        Memb_list* target = nt._ml_list[vecPlay.mtype];
#if CHKPNTDEBUG
        // remember the values as they were read, i.e. before ix is
        // converted to the in-memory layout below
        ntc.vtype[iplay] = vecPlay.vtype;
        ntc.mtype[iplay] = vecPlay.mtype;
        ntc.vecplay_ix[iplay] = vecPlay.ix;
#endif

        // convert ix to the layout of the mechanism data, then apply the
        // mechanism permutation if there is one
        vecPlay.ix = nrn_param_layout(vecPlay.ix, vecPlay.mtype, target);
        if (target->_permute) {
            vecPlay.ix = nrn_index_permute(vecPlay.ix, vecPlay.mtype, target);
        }

        // the staged y/t vectors are moved into the new player
        double* played_variable = target->data + vecPlay.ix;
        nt._vecplay[iplay] = new VecPlayContinuous(played_variable,
                                                   std::move(vecPlay.yvec),
                                                   std::move(vecPlay.tvec),
                                                   nullptr,
                                                   nt.id);
    }
}

/**
 * Fill the NrnThread `nt` from the data staged in this Phase2 object.
 *
 * In order, this function:
 *  - copies the basic counts and creates the mechanism list (nt.tml / nt._ml_list),
 *  - allocates shadow rhs/d, _idata and _vdata,
 *  - lays the node arrays and every mechanism's data out inside nt._data,
 *  - copies nodeindices/pdata per mechanism and sets up the Point_process array,
 *  - relocates pdata (pdata_relocation) and optionally applies the node permutation,
 *  - sets mechanism dependencies, before/after lists and the WATCH type list,
 *  - associates PreSyn objects with voltages or Point_process sources,
 *  - sets NetCon targets, weights and delays,
 *  - runs the bbcore_read callbacks, creates the VecPlayContinuous objects,
 *    restores queued events if any were read, and sets up the net send buffers.
 *
 * The order of these steps matters: later steps index into arrays that the
 * earlier steps allocate, transform or permute.
 *
 * @param nt          thread to fill in
 * @param userParams  provides the gid group id of this thread and the
 *                    checkpoint object used to restore the event queue
 */
void Phase2::populate(NrnThread& nt, const UserParams& userParams) {
    NrnThreadChkpnt& ntc = nrnthread_chkpnt[nt.id];
    ntc.file_id = userParams.gidgroups[nt.id];

    nt.ncell = n_real_cell;
    nt.end = n_node;
    nt.n_real_output = n_real_output;

#if CHKPNTDEBUG
    ntc.n_outputgids = n_output;
    ntc.nmech = n_mech;
#endif

    /// Checkpoint in coreneuron is defined for both phase 1 and phase 2 since they are written
    /// together
    // one Memb_list slot per registered mechanism type, indexed by type
    nt._ml_list = (Memb_list**) ecalloc_align(corenrn.get_memb_funcs().size(), sizeof(Memb_list*));

    auto& memb_func = corenrn.get_memb_funcs();
#if CHKPNTDEBUG
    ntc.mlmap = new Memb_list_chkpnt*[memb_func.size()];
    for (int i = 0; i < memb_func.size(); ++i) {
        ntc.mlmap[i] = nullptr;
    }
#endif

    nt.stream_id = 0;
    nt.compute_gpu = 0;
    auto& nrn_prop_param_size_ = corenrn.get_prop_param_size();
    auto& nrn_prop_dparam_size_ = corenrn.get_prop_dparam_size();

/* read_phase2 is being called from openmp region
 * and hence we can set the stream equal to current thread id.
 * In fact we could set gid as stream_id when we will have nrn threads
 * greater than number of omp threads.
 */
#if defined(_OPENMP)
    nt.stream_id = omp_get_thread_num();
#endif

    int shadow_rhs_cnt = 0;
    nt.shadow_rhs_cnt = 0;

    // Build the singly linked nt.tml chain, one entry per mechanism, and
    // register each Memb_list in nt._ml_list under its type.
    // NOTE(review): the first iteration relies on nt.tml being null on entry;
    // otherwise tml_last would be dereferenced while still nullptr.
    NrnThreadMembList* tml_last = nullptr;
    for (int i = 0; i < n_mech; ++i) {
        auto tml =
            create_tml(nt, i, memb_func[mech_types[i]], shadow_rhs_cnt, mech_types, nodecounts);

        nt._ml_list[tml->index] = tml->ml;

#if CHKPNTDEBUG
        Memb_list_chkpnt* mlc = new Memb_list_chkpnt;
        ntc.mlmap[tml->index] = mlc;
#endif

        if (nt.tml) {
            tml_last->next = tml;
        } else {
            nt.tml = tml;
        }
        tml_last = tml;
    }

    // shadow arrays are only allocated when create_tml reported a need for them
    if (shadow_rhs_cnt) {
        nt._shadow_rhs = (double*) ecalloc_align(nrn_soa_padded_size(shadow_rhs_cnt, 0),
                                                 sizeof(double));
        nt._shadow_d = (double*) ecalloc_align(nrn_soa_padded_size(shadow_rhs_cnt, 0),
                                               sizeof(double));
        nt.shadow_rhs_cnt = shadow_rhs_cnt;
    }

    nt.mapping = nullptr;  // section segment mapping

    nt._nidata = n_idata;
    if (nt._nidata)
        nt._idata = (int*) ecalloc(nt._nidata, sizeof(int));
    else
        nt._idata = nullptr;
    // see patternstim.cpp
    // only the first thread (nrn_threads[0]) gets the extra vdata slots
    int extra_nv = (&nt == nrn_threads) ? nrn_extra_thread0_vdata : 0;
    nt._nvdata = n_vdata;
    if (nt._nvdata + extra_nv)
        nt._vdata = (void**) ecalloc_align(nt._nvdata + extra_nv, sizeof(void*));
    else
        nt._vdata = nullptr;

    // The data format begins with the matrix data
    // Each node array occupies one padded block of n_data_padded doubles.
    int n_data_padded = nrn_soa_padded_size(nt.end, SOA_LAYOUT);
    nt._data = _data;
    nt._actual_rhs = nt._data + 0 * n_data_padded;
    nt._actual_d = nt._data + 1 * n_data_padded;
    nt._actual_a = nt._data + 2 * n_data_padded;
    nt._actual_b = nt._data + 3 * n_data_padded;
    nt._actual_v = nt._data + 4 * n_data_padded;
    nt._actual_area = nt._data + 5 * n_data_padded;
    nt._actual_diam = n_diam ? nt._data + 6 * n_data_padded : nullptr;

    // running offset (in doubles) of the next free position in nt._data
    size_t offset = 6 * n_data_padded;
    if (n_diam) {
        // in the rare case that a mechanism has dparam with diam semantics
        // then actual_diam array added after matrix in nt._data
        // Generally wasteful since only a few diam are pointed to.
        // Probably better to move the diam semantics to the p array of the mechanism
        offset += n_data_padded;
    }

    // Memb_list.data points into the nt._data array.
    // Also count the number of Point_process
    // NOTE(review): this local has the same name as the class member
    // num_point_process and hides it inside this function.
    int num_point_process = 0;
    for (auto tml = nt.tml; tml; tml = tml->next) {
        Memb_list* ml = tml->ml;
        int type = tml->index;
        int layout = corenrn.get_mech_data_layout()[type];
        int n = ml->nodecount;
        int sz = nrn_prop_param_size_[type];
        offset = nrn_soa_byte_align(offset);
        ml->data = nt._data + offset;
        offset += nrn_soa_padded_size(n, layout) * sz;
        if (corenrn.get_pnt_map()[type] > 0) {
            num_point_process += n;
        }
    }
    nt.pntprocs = (Point_process*) ecalloc_align(num_point_process,
                                                 sizeof(Point_process));  // includes acell with and
                                                                          // without gid
    nt.n_pntproc = num_point_process;
    nt._ndata = offset;


    // matrix info
    nt._v_parent_index = v_parent_index;

#if CHKPNTDEBUG
    ntc.parent = new int[nt.end];
    memcpy(ntc.parent, nt._v_parent_index, nt.end * sizeof(int));
    ntc.area = new double[nt.end];
    memcpy(ntc.area, nt._actual_area, nt.end * sizeof(double));
#endif

    // synoffset: index in nt.pntprocs of the first Point_process of the
    // mechanism currently handled; pnt_offset[type] remembers it per type
    // (entries of types without point processes stay 0).
    int synoffset = 0;
    std::vector<int> pnt_offset(memb_func.size());

    // All the mechanism data and pdata.
    // Also fill in the pnt_offset
    // Complete spec of Point_process except for the acell presyn_ field.
    // itml walks tmls in step with the nt.tml chain.
    int itml = 0;
    for (auto tml = nt.tml; tml; tml = tml->next, ++itml) {
        int type = tml->index;
        Memb_list* ml = tml->ml;
        int n = ml->nodecount;
        int szp = nrn_prop_param_size_[type];
        int szdp = nrn_prop_dparam_size_[type];
        int layout = corenrn.get_mech_data_layout()[type];

        ml->nodeindices = (int*) ecalloc_align(ml->nodecount, sizeof(int));
        std::copy(tmls[itml].nodeindices.begin(), tmls[itml].nodeindices.end(), ml->nodeindices);

        // rearrange the parameter data in place according to the mechanism layout
        mech_data_layout_transform<double>(ml->data, n, szp, layout);

        if (szdp) {
            ml->pdata = (int*) ecalloc_align(nrn_soa_padded_size(n, layout) * szdp, sizeof(int));
            std::copy(tmls[itml].pdata.begin(), tmls[itml].pdata.end(), ml->pdata);
            mech_data_layout_transform<int>(ml->pdata, n, szdp, layout);

#if CHKPNTDEBUG  // Not substantive. Only for debugging.
            Memb_list_chkpnt* mlc = ntc.mlmap[type];
            mlc->pdata_not_permuted = (int*) coreneuron::ecalloc_align(n * szdp, sizeof(int));
            if (layout == Layout::AoS) {  // only copy
                for (int i = 0; i < n; ++i) {
                    for (int j = 0; j < szdp; ++j) {
                        mlc->pdata_not_permuted[i * szdp + j] = ml->pdata[i * szdp + j];
                    }
                }
            } else if (layout == Layout::SoA) {  // transpose and unpad
                int align_cnt = nrn_soa_padded_size(n, layout);
                for (int i = 0; i < n; ++i) {
                    for (int j = 0; j < szdp; ++j) {
                        mlc->pdata_not_permuted[i * szdp + j] = ml->pdata[i + j * align_cnt];
                    }
                }
            }
#endif
        } else {
            ml->pdata = nullptr;
        }
        if (corenrn.get_pnt_map()[type] > 0) {  // POINT_PROCESS mechanism including acell
            int cnt = ml->nodecount;
            Point_process* pnt = nullptr;
            pnt = nt.pntprocs + synoffset;
            pnt_offset[type] = synoffset;
            synoffset += cnt;
            for (int i = 0; i < cnt; ++i) {
                Point_process* pp = pnt + i;
                pp->_type = type;
                pp->_i_instance = i;
                // dparam slot 1 of the instance holds the _vdata index that
                // is made to point at its Point_process
                nt._vdata[ml->pdata[nrn_i_layout(i, cnt, 1, szdp, layout)]] = pp;
                pp->_tid = nt.id;
            }
        }
    }

    // pnt_offset needed for SelfEvent transfer from NEURON. Not needed on GPU.
    // Ugh. Related but not same as NetReceiveBuffer._pnt_offset
    nt._pnt_offset = pnt_offset;

    pdata_relocation(nt, memb_func);

    /* if desired, apply the node permutation. This involves permuting
       at least the node parameter arrays for a, b, and area (and diam) and all
       integer vector values that index into nodes. This could have been done
       when originally filling the arrays with AoS ordered data, but can also
       be done now, after the SoA transformation. The latter has the advantage
       that the present order is consistent with all the layout values. Note
       that after this portion of the permutation, a number of other node index
       vectors will be read and will need to be permuted as well in subsequent
       sections of this function.
    */
    if (interleave_permute_type) {
        nt._permute = interleave_order(nt.id, nt.ncell, nt.end, nt._v_parent_index);
    }
    if (nt._permute) {
        int* p = nt._permute;
        permute_data(nt._actual_a, nt.end, p);
        permute_data(nt._actual_b, nt.end, p);
        permute_data(nt._actual_area, nt.end, p);
        permute_data(nt._actual_v,
                     nt.end,
                     p);  // need if restore or finitialize does not initialize voltage
        if (nt._actual_diam) {
            permute_data(nt._actual_diam, nt.end, p);
        }
        // index values change as well as ordering
        permute_ptr(nt._v_parent_index, nt.end, p);
        node_permute(nt._v_parent_index, nt.end, p);

#if CORENRN_DEBUG
        for (int i = 0; i < nt.end; ++i) {
            printf("parent[%d] = %d\n", i, nt._v_parent_index[i]);
        }
#endif

        // specify the ml->_permute and sort the nodeindices
        // Have to calculate all the permute before updating pdata in case
        // POINTER to data of other mechanisms exist.
        for (auto tml = nt.tml; tml; tml = tml->next) {
            if (tml->ml->nodeindices) {  // not artificial
                permute_nodeindices(tml->ml, p);
            }
        }
        for (auto tml = nt.tml; tml; tml = tml->next) {
            if (tml->ml->nodeindices) {  // not artificial
                permute_ml(tml->ml, tml->index, nt);
            }
        }

        // permute the Point_process._i_instance
        for (int i = 0; i < nt.n_pntproc; ++i) {
            Point_process& pp = nt.pntprocs[i];
            Memb_list* ml = nt._ml_list[pp._type];
            if (ml->_permute) {
                pp._i_instance = ml->_permute[pp._i_instance];
            }
        }
    }

    set_dependencies(nt, memb_func);

    fill_before_after_lists(nt, memb_func);

    // for fast watch statement checking
    // setup a list of types that have WATCH statement
    {
        int sz = 0;  // count the types with WATCH
        for (auto tml = nt.tml; tml; tml = tml->next) {
            if (corenrn.get_watch_check()[tml->index]) {
                ++sz;
            }
        }
        if (sz) {
            // one extra trailing slot is allocated and never written, so it
            // acts as the terminator (presumably 0 as left by ecalloc)
            nt._watch_types = (int*) ecalloc(sz + 1, sizeof(int));
            sz = 0;
            for (auto tml = nt.tml; tml; tml = tml->next) {
                if (corenrn.get_watch_check()[tml->index]) {
                    nt._watch_types[sz++] = tml->index;
                }
            }
        }
    }
    auto& pnttype2presyn = corenrn.get_pnttype2presyn();
    auto& nrn_has_net_event_ = corenrn.get_has_net_event();
    // create the nt.pnt2presyn_ix array of arrays.
    // Entry i stays null when the i-th net_event type has no instances here.
    nt.pnt2presyn_ix = (int**) ecalloc(nrn_has_net_event_.size(), sizeof(int*));
    for (size_t i = 0; i < nrn_has_net_event_.size(); ++i) {
        Memb_list* ml = nt._ml_list[nrn_has_net_event_[i]];
        if (ml && ml->nodecount > 0) {
            nt.pnt2presyn_ix[i] = (int*) ecalloc(ml->nodecount, sizeof(int));
        }
    }

    // Real cells are at the beginning of the nt.presyns followed by
    // acells (with and without gids mixed together)
    // Here we associate the real cells with voltage pointers and
    // acell PreSyn with the Point_process.
    // nt.presyns order same as output_vindex order
#if CHKPNTDEBUG
    ntc.output_vindex = new int[nt.n_presyn];
    memcpy(ntc.output_vindex, output_vindex.data(), nt.n_presyn * sizeof(int));
#endif
    if (nt._permute) {
        // only indices >= 0 (i.e. _actual_v indices) will be changed.
        node_permute(output_vindex.data(), nt.n_presyn, nt._permute);
    }
#if CHKPNTDEBUG
    ntc.output_threshold = new double[n_real_output];
    memcpy(ntc.output_threshold, output_threshold.data(), n_real_output * sizeof(double));
#endif

    for (int i = 0; i < nt.n_presyn; ++i) {  // real cells
        PreSyn* ps = nt.presyns + i;

        int ix = output_vindex[i];
        if (ix == -1 && i < n_real_output) {  // real cell without a presyn
            continue;
        }
        if (ix < 0) {
            // a negative value encodes a Point_process source as
            // -(index * 1000 + type)
            ix = -ix;
            int index = ix / 1000;
            int type = ix % 1000;
            Point_process* pnt = nt.pntprocs + (pnt_offset[type] + index);
            ps->pntsrc_ = pnt;
            // pnt->_presyn = ps;
            int ip2ps = pnttype2presyn[pnt->_type];
            if (ip2ps >= 0) {
                nt.pnt2presyn_ix[ip2ps][pnt->_i_instance] = i;
            }
            if (ps->gid_ < 0) {
                ps->gid_ = -1;
            }
        } else {
            assert(ps->gid_ > -1);
            ps->thvar_index_ = ix;  // index into _actual_v
            assert(ix < nt.end);
            ps->threshold_ = output_threshold[i];
        }
    }

    // initial net_send_buffer size about 1% of number of presyns
    // nt._net_send_buffer_size = nt.ncell/100 + 1;
    // but, to avoid reallocation complexity on GPU ...
    nt._net_send_buffer_size = n_real_output;
    nt._net_send_buffer = (int*) ecalloc_align(nt._net_send_buffer_size, sizeof(int));

    int nnetcon = nt.n_netcon;

    // it may happen that Point_process structures will be made unnecessary
    // by factoring into NetCon.

#if CHKPNTDEBUG
    ntc.pnttype = new int[nnetcon];
    ntc.pntindex = new int[nnetcon];
    memcpy(ntc.pnttype, pnttype.data(), nnetcon * sizeof(int));
    memcpy(ntc.pntindex, pntindex.data(), nnetcon * sizeof(int));
#endif
    // NetCons whose target type is not positive keep the target_ / active_
    // values they already have.
    for (int i = 0; i < nnetcon; ++i) {
        int type = pnttype[i];
        if (type > 0) {
            int index = pnt_offset[type] + pntindex[i];  /// Potentially uninitialized pnt_offset[],
                                                         /// check for previous assignments
            NetCon& nc = nt.netcons[i];
            nc.target_ = nt.pntprocs + index;
            nc.active_ = true;
        }
    }

    handle_weights(nt, nnetcon, ntc);

    get_info_from_bbcore(nt, memb_func, ntc);

    set_vec_play(nt, ntc);

    // queued events are only present when they were read from a checkpoint
    if (!events.empty()) {
        userParams.checkPoints.restore_tqueue(nt, *this);
    }

    set_net_send_buffer(nt._ml_list, pnt_offset);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/phase2.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/io/nrn_filehandler.hpp"
#include "coreneuron/io/user_params.hpp"
#include "coreneuron/utils/ivocvect.hpp"

#include <memory>

namespace coreneuron {
struct NrnThread;
struct NrnThreadMembList;
struct Memb_func;
struct Memb_list;
struct NrnThreadChkpnt;

/**
 * Staging area for the "phase 2" data of one NrnThread.
 *
 * The data is first loaded with read_file() or read_direct() and then
 * transferred into the NrnThread by populate().
 *
 * All scalar and raw-pointer members carry default member initializers so
 * that a freshly constructed object never holds indeterminate values, even
 * if a member is inspected before one of the read functions has set it.
 */
class Phase2 {
  public:
    /// Load the phase 2 data from the file handler `F`.
    void read_file(FileHandler& F, const NrnThread& nt);
    /// Load the phase 2 data for thread `thread_id` without going through a file.
    void read_direct(int thread_id, const NrnThread& nt);
    /// Transfer the loaded data into `nt`.
    void populate(NrnThread& nt, const UserParams& userParams);

    std::vector<int> preSynConditionEventFlags;

    // All of this is public for nrn_checkpoint
    /// Common base of all queued event records: every event has a delivery time.
    struct EventTypeBase {
        double time;
    };
    struct NetConType_: public EventTypeBase {
        int netcon_index;
    };
    struct SelfEventType_: public EventTypeBase {
        int target_type;
        int point_proc_instance;
        int target_instance;
        double flag;
        int movable;
        int weight_index;
    };
    struct PreSynType_: public EventTypeBase {
        int presyn_index;
    };
    struct NetParEvent_: public EventTypeBase {};
    struct PlayRecordEventType_: public EventTypeBase {
        int play_record_type;
        int vecplay_index;
    };

    /// Staged description of one VecPlayContinuous instance (see set_vec_play).
    struct VecPlayContinuous_ {
        int vtype;  // must be VecPlayContinuousType
        int mtype;  // mechanism type whose data is played into
        int ix;     // index of the played variable in that mechanism's data
        IvocVect yvec;
        IvocVect tvec;

        int last_index;
        int discon_index;
        int ubound_index;
    };
    std::vector<VecPlayContinuous_> vec_play_continuous;
    // -1 until a read function assigns it (presumably only when events are restored)
    int patstim_index = -1;

    /// Queued events as (type, record) pairs; non-empty only if events were read.
    std::vector<std::pair<int, std::shared_ptr<EventTypeBase>>> events;

  private:
    void check_mechanism();
    void transform_int_data(int elem0,
                            int nodecount,
                            int* pdata,
                            int i,
                            int dparam_size,
                            int layout,
                            int n_node_);
    void set_net_send_buffer(Memb_list** ml_list, const std::vector<int>& pnt_offset);
    void restore_events(FileHandler& F);
    void fill_before_after_lists(NrnThread& nt, const std::vector<Memb_func>& memb_func);
    void pdata_relocation(const NrnThread& nt, const std::vector<Memb_func>& memb_func);
    void set_dependencies(const NrnThread& nt, const std::vector<Memb_func>& memb_func);
    void handle_weights(NrnThread& nt, int n_netcon, NrnThreadChkpnt& ntc);
    void get_info_from_bbcore(NrnThread& nt,
                              const std::vector<Memb_func>& memb_func,
                              NrnThreadChkpnt& ntc);
    void set_vec_play(NrnThread& nt, NrnThreadChkpnt& ntc);

    int n_real_cell = 0;    // becomes nt.ncell
    int n_output = 0;
    int n_real_output = 0;  // becomes nt.n_real_output
    int n_node = 0;         // becomes nt.end
    int n_diam = 0;         // 0 if not needed, else n_node
    int n_mech = 0;         // number of mechanisms, i.e. entries of mech_types / tmls
    std::vector<int> mech_types;
    std::vector<int> nodecounts;
    int n_idata = 0;  // becomes nt._nidata
    int n_vdata = 0;  // becomes nt._nvdata
    // handed over to nt._v_parent_index by populate()
    int* v_parent_index = nullptr;
    /* TO DO: when this is fixed use it like that
    std::vector<double> actual_a;
    std::vector<double> actual_b;
    std::vector<double> actual_area;
    std::vector<double> actual_v;
    std::vector<double> actual_diam;
    */
    // handed over to nt._data by populate()
    double* _data = nullptr;
    /// Per-mechanism staged data, in the same order as mech_types.
    struct TML {
        std::vector<int> nodeindices;
        std::vector<int> pdata;
        int type;
        std::vector<int> iArray;            // integer data consumed by bbcore_read
        std::vector<double> dArray;         // double data consumed by bbcore_read
        std::vector<int> pointer2type;      // target types of POINTER dparams
    };
    std::vector<TML> tmls;
    std::vector<int> output_vindex;
    std::vector<double> output_threshold;
    std::vector<int> pnttype;   // per NetCon: target Point_process type
    std::vector<int> pntindex;  // per NetCon: target instance within its type
    std::vector<double> weights;
    std::vector<double> delay;  // per NetCon delay
    int num_point_process = 0;
};
}  // namespace coreneuron


================================================
FILE: coreneuron/io/prcellstate.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <vector>
#include <map>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/nrn_setup.hpp"
#include "coreneuron/network/netcon.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

#define precision 15
namespace coreneuron {
// File-local state shared by the printing helpers below.

// Point_process -> index assigned when it was printed by pr_memb;
// used for deciding if NetCon is to be printed
static std::map<Point_process*, int> pnt2index;
// running count of printed point processes.
static int pntindex;
// NetCon -> its source event; rebuilt from scratch by pr_netcon
static std::map<NetCon*, DiscreteEvent*> map_nc2src;
// Inverse of nt._permute, allocated on first use by inv_permute().
// NOTE(review): the cache is not keyed by thread; presumably it is reset
// elsewhere before a different NrnThread is printed - confirm.
static std::vector<int>* inv_permute_;

// Map index i through the node permutation of nt; identity when there is none.
static int permute(int i, NrnThread& nt) {
    if (!nt._permute) {
        return i;
    }
    return nt._permute[i];
}

// Map index i through the inverse of the node permutation of nt; identity
// when nt has no permutation. The inverse table is built on first use and
// kept in the file-level inv_permute_.
static int inv_permute(int i, NrnThread& nt) {
    nrn_assert(i >= 0 && i < nt.end);
    if (nt._permute) {
        if (inv_permute_ == nullptr) {
            inv_permute_ = new std::vector<int>(nt.end);
            std::vector<int>& inverse = *inv_permute_;
            for (int src = 0; src < nt.end; ++src) {
                inverse[nt._permute[src]] = src;
            }
        }
        return (*inv_permute_)[i];
    }
    return i;
}

// Map instance index i through the permutation of ml; identity when there is none.
static int ml_permute(int i, Memb_list* ml) {
    if (!ml->_permute) {
        return i;
    }
    return ml->_permute[i];
}

// Note: cellnodes array is in unpermuted order.

// Print the parameters of every instance of mechanism `type` that sits on a
// node of the selected cell. `cellnodes` is indexed by unpermuted node index
// and holds the cell-relative node index, or a negative value for nodes that
// do not belong to the cell. Artificial mechanisms are not printed.
static void pr_memb(int type, Memb_list* ml, int* cellnodes, NrnThread& nt, FILE* f) {
    if (corenrn.get_is_artificial()[type]) {
        return;
    }

    const int n_param = corenrn.get_prop_param_size()[type];
    const int n_dparam = corenrn.get_prop_dparam_size()[type];
    const bool has_net_receive = corenrn.get_pnt_receive()[type];
    const int layout = corenrn.get_mech_data_layout()[type];
    const int n_instance = ml->nodecount;
    bool wrote_header = false;

    for (int iorig = 0; iorig < n_instance; ++iorig) {  // original index
        const int inst = ml_permute(iorig, ml);          // present index
        const int inode = ml->nodeindices[inst];         // permuted node
        // original node index relative to this cell
        const int cix = cellnodes[inv_permute(inode, nt)];
        if (cix < 0) {
            continue;  // instance is not on this cell
        }

        // the mechanism header is printed once, before its first instance
        if (!wrote_header) {
            wrote_header = true;
            fprintf(f, "type=%d %s size=%d\n", type, corenrn.get_memb_func(type).sym, n_param);
        }

        if (has_net_receive) {
            fprintf(f, "%d nri %d\n", cix, pntindex);
            // dparam slot 1 holds the _vdata index of the Point_process;
            // remember its print index so pr_netcon can find its NetCons
            const int slot = nrn_i_layout(inst, n_instance, 1, n_dparam, layout);
            Point_process* pp = (Point_process*) nt._vdata[ml->pdata[slot]];
            pnt2index[pp] = pntindex;
            ++pntindex;
        }

        for (int iparam = 0; iparam < n_param; ++iparam) {
            const int slot = nrn_i_layout(inst, n_instance, iparam, n_param, layout);
            fprintf(f, " %d %d %.*g\n", cix, iparam, precision, ml->data[slot]);
        }
    }
}

// Print every NetCon of nt that targets one of the point processes recorded
// in pnt2index (filled by pr_memb), grouped by point process print index.
// Each line holds: pntindex, source (gid, mechanism name or "v"), active
// flag, delay and the weights of the NetCon.
static void pr_netcon(NrnThread& nt, FILE* f) {
    if (pntindex == 0) {
        return;
    }
    // pnt2index table has been filled

    // Record `src` as the source of `nc` if `nc` is one of the NetCons to print.
    auto attach_source = [](NetCon* nc, DiscreteEvent* src) {
        auto slot = map_nc2src.find(nc);
        if (slot == map_nc2src.end()) {
            return false;
        }
        slot->second = src;
        return true;
    };
    // Start of a NetCon line whose source is identified by a name.
    auto print_named_source = [&](int ipnt, const char* name, NetCon* nc) {
        fprintf(f, "%d %s %d %.*g", ipnt, name, nc->active_ ? 1 : 0, precision, nc->delay_);
    };
    // Start of a NetCon line whose source is identified by a gid.
    auto print_gid_source = [&](int ipnt, int gid, NetCon* nc) {
        fprintf(f, "%d %d %d %.*g", ipnt, gid, nc->active_ ? 1 : 0, precision, nc->delay_);
    };

    // List of NetCon for each of the NET_RECEIVE point process instances
    // Also create the initial map of NetCon <-> DiscreteEvent (PreSyn)
    std::vector<std::vector<NetCon*>> nclist(pntindex);
    map_nc2src.clear();
    int nc_cnt = 0;
    for (int inc = 0; inc < nt.n_netcon; ++inc) {
        NetCon* nc = nt.netcons + inc;
        auto target = pnt2index.find(nc->target_);
        if (target == pnt2index.end()) {
            continue;  // target was not printed, so neither is this NetCon
        }
        nclist[target->second].push_back(nc);
        map_nc2src[nc] = nullptr;
        ++nc_cnt;
    }
    fprintf(f, "netcons %d\n", nc_cnt);
    fprintf(f, " pntindex srcgid active delay weights\n");

    /// Fill the NetCon <-> DiscreteEvent map with PreSyn-s
    // presyns can come from any thread
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& ntps = nrn_threads[ith];
        for (int ips = 0; ips < ntps.n_presyn; ++ips) {
            PreSyn* ps = ntps.presyns + ips;
            for (int k = 0; k < ps->nc_cnt_; ++k) {
                attach_source(netcon_in_presyn_order_[ps->nc_index_ + k], ps);
            }
        }
    }

    /// Fill the NetCon <-> DiscreteEvent map with InputPreSyn-s
    /// Traverse gid <-> InputPreSyn map and loop over NetCon-s of the
    /// correspondent InputPreSyn. If NetCon is in the nc2src map,
    /// remember its ips and the gid
    std::map<NetCon*, int> map_nc2gid;
    for (const auto& gid: gid2in) {
        InputPreSyn* ips = gid.second;  /// input presyn
        for (int k = 0; k < ips->nc_cnt_; ++k) {
            NetCon* nc = netcon_in_presyn_order_[ips->nc_index_ + k];
            if (attach_source(nc, ips)) {
                map_nc2gid[nc] = gid.first;  /// src gid of the input presyn
            }
        }
    }

    for (int i = 0; i < pntindex; ++i) {
        for (NetCon* nc: nclist[i]) {
            auto entry = map_nc2src.find(nc);
            if (entry == map_nc2src.end()) {
                // seems like there should be no NetCon which is not in the map
                print_gid_source(i, -3, nc);
            } else {
                DiscreteEvent* de = entry->second;
                if (de && de->type() == PreSynType) {
                    PreSyn* ps = (PreSyn*) de;
                    const int srcgid = ps->gid_;
                    Point_process* pnt = ps->pntsrc_;
                    if (srcgid < 0 && pnt) {
                        // source without gid that is a point process: print its mechanism name
                        print_named_source(i, corenrn.get_memb_func(pnt->_type).sym, nc);
                    } else if (srcgid < 0 && ps->thvar_index_ > 0) {
                        // source without gid that watches a voltage
                        print_named_source(i, "v", nc);
                    } else {
                        print_gid_source(i, srcgid, nc);
                    }
                } else {
                    // not a PreSyn: use the gid recorded for the InputPreSyn
                    print_gid_source(i, map_nc2gid[nc], nc);
                }
            }

            const int wcnt = corenrn.get_pnt_receive_size()[nc->target_->_type];
            for (int k = 0; k < wcnt; ++k) {
                fprintf(f, " %.*g", precision, nt.weights[nc->u.weight_index_ + k]);
            }
            fprintf(f, "\n");
        }
    }
}

/**
 * Write the state of the real cell whose spike source is `ps` into `f`.
 *
 * The dump contains the nodes of the cell (numbered relative to the cell, the root
 * being node 0), the node voltages, the mechanisms handed to pr_memb and the NetCon
 * information written by pr_netcon.
 *
 * @param ps  PreSyn whose thvar_index_ is the threshold node of the cell
 * @param nt  thread the cell belongs to
 * @param f   output file, already opened for writing
 */
static void pr_realcell(PreSyn& ps, NrnThread& nt, FILE* f) {
    // restart the numbering used to associate NetCons with Point_process identifiers
    pntindex = 0;

    // the threshold variable has to be a voltage, i.e. a node index of this thread
    printf("thvar_index_=%d end=%d\n", inv_permute(ps.thvar_index_, nt), nt.end);
    if (ps.thvar_index_ < 0 || ps.thvar_index_ >= nt.end) {
        hoc_execerror("gid not associated with a voltage", 0);
    }
    const int threshold_node = ps.thvar_index_;

    // follow the parent links until an index below nt.ncell is reached: the root node
    int root_node = threshold_node;
    while (root_node >= nt.ncell) {
        root_node = nt._v_parent_index[root_node];
    }

    // cellnodes is indexed in unpermuted order: -1 marks a node that does not belong
    // to the cell, any other value is the node number inside the cell.
    // The nodes of a cell (other than the root) are not assumed to be contiguous.
    int* cellnodes = new int[nt.end];
    for (int k = 0; k < nt.end; ++k) {
        cellnodes[k] = -1;
    }
    cellnodes[inv_permute(root_node, nt)] = 0;
    int node_count = 1;
    for (int iorig = nt.ncell; iorig < nt.end; ++iorig) {  // unpermuted order
        const int parent = nt._v_parent_index[permute(iorig, nt)];
        if (cellnodes[inv_permute(parent, nt)] >= 0) {
            cellnodes[iorig] = node_count++;
        }
    }

    const int threshold_in_cell = cellnodes[inv_permute(threshold_node, nt)] - 1;
    fprintf(f, "%d nodes  %d is the threshold node\n", node_count, threshold_in_cell);
    fprintf(f, " threshold %.*g\n", precision, ps.threshold_);

    // tree structure and geometry related values of every node of the cell
    fprintf(f, "inode parent area a b\n");
    for (int iorig = 0; iorig < nt.end; ++iorig) {
        if (cellnodes[iorig] < 0) {
            continue;
        }
        const int inode = permute(iorig, nt);
        const int iparent = nt._v_parent_index[inode];
        int parent_in_cell = -1;
        if (iparent >= 0) {
            parent_in_cell = cellnodes[inv_permute(iparent, nt)];
        }
        fprintf(f,
                "%d %d %.*g %.*g %.*g\n",
                cellnodes[iorig],
                parent_in_cell,
                precision,
                nt._actual_area[inode],
                precision,
                nt._actual_a[inode],
                precision,
                nt._actual_b[inode]);
    }

    // voltage of every node of the cell
    fprintf(f, "inode v\n");
    for (int iorig = 0; iorig < nt.end; ++iorig) {
        if (cellnodes[iorig] < 0) {
            continue;
        }
        fprintf(f, "%d %.*g\n", cellnodes[iorig], precision, nt._actual_v[permute(iorig, nt)]);
    }

    // each mechanism
    NrnThreadMembList* tml = nt.tml;
    while (tml) {
        pr_memb(tml->index, tml->ml, cellnodes, nt, f);
        tml = tml->next;
    }

    // the NetCon info (uses pnt2index)
    pr_netcon(nt, f);

    // release everything that was set up for this cell
    delete[] cellnodes;
    pnt2index.clear();
    if (inv_permute_) {
        delete inv_permute_;
        inv_permute_ = nullptr;
    }
}

/**
 * Dump the state of the cell with output gid `gid` into `<gid>_<suffix>.corenrn`.
 *
 * The presyns of every thread are searched for the first one whose output_index_
 * equals `gid`. The file always receives the gid, the thread time and celsius; the
 * cell itself is written by pr_realcell only when the presyn has a threshold
 * variable (thvar_index_ >= 0).
 *
 * @param gid     gid to look for
 * @param suffix  tag that becomes part of the file name
 * @return 1 if the gid was found and its file written, 0 otherwise (gid not found
 *         or the file could not be opened)
 */
int prcellstate(int gid, const char* suffix) {
    // search the NrnThread.presyns for the gid
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        for (int ip = 0; ip < nt.n_presyn; ++ip) {
            PreSyn& ps = nt.presyns[ip];
            if (ps.output_index_ != gid) {
                continue;
            }
            // found it so create a <gid>_<suffix>.corenrn file
            const std::string filename = std::to_string(gid) + "_" + suffix + ".corenrn";
            FILE* f = fopen(filename.c_str(), "w");
            if (!f) {
                // assert() is compiled out with NDEBUG, so the failure has to be handled
                // explicitly instead of writing through a null FILE*.
                fprintf(stderr, "prcellstate: cannot open %s for writing\n", filename.c_str());
                return 0;
            }
            fprintf(f, "gid = %d\n", gid);
            fprintf(f, "t = %.*g\n", precision, nt._t);
            fprintf(f, "celsius = %.*g\n", precision, celsius);
            if (ps.thvar_index_ >= 0) {
                pr_realcell(ps, nt, f);
            }
            fclose(f);
            return 1;
        }
    }
    return 0;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/prcellstate.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

namespace coreneuron {

/**
 * Write the state of the cell identified by `gid` to the file `<gid>_<suffix>.corenrn`.
 *
 * @param gid     gid of the cell to dump
 * @param suffix  tag that becomes part of the file name
 * @return 1 if the gid was found and its state file written, 0 otherwise
 */
extern int prcellstate(int gid, const char* suffix);

}  // namespace coreneuron


================================================
FILE: coreneuron/io/reports/binary_report_handler.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "binary_report_handler.hpp"
#ifdef ENABLE_BIN_REPORTS
#include "reportinglib/Records.h"
#endif  // ENABLE_BIN_REPORTS

namespace coreneuron {

/**
 * Create a binary report.
 *
 * When binary reports are enabled, `dt` is first handed to records_set_atomic_step();
 * everything else is delegated to ReportHandler::create_report with the same arguments.
 */
void BinaryReportHandler::create_report(ReportConfiguration& config,
                                        double dt,
                                        double tstop,
                                        double delay) {
#ifdef ENABLE_BIN_REPORTS
    records_set_atomic_step(dt);
#endif  // ENABLE_BIN_REPORTS
    ReportHandler::create_report(config, dt, tstop, delay);
}

#ifdef ENABLE_BIN_REPORTS
/// Fill `extra` for a soma target: {1, <segment count of "soma">, 0, 0, 0}.
static void create_soma_extra(const CellMapping& mapping, std::array<int, 5>& extra) {
    // the extra "mask" stands for all infos not written in the report:
    // here only the soma count is reported
    extra.fill(0);
    extra[0] = 1;
    extra[1] = mapping.get_seclist_segment_count("soma");
}

/// Fill `extra` for a compartment target: extra[1..4] receive the section counts of
/// "soma", "axon", "dend" and "apic" (in this order), extra[0] their sum.
static void create_compartment_extra(const CellMapping& mapping, std::array<int, 5>& extra) {
    static const char* const seclist_names[4] = {"soma", "axon", "dend", "apic"};
    int total = 0;
    for (int k = 0; k < 4; ++k) {
        extra[k + 1] = mapping.get_seclist_section_count(seclist_names[k]);
        total += extra[k + 1];
    }
    extra[0] = total;
}

/// Fill `extra` for a custom report: only the section counts of "soma" (extra[1]) and
/// "apic" (extra[4]) are set, extra[2] and extra[3] stay 0 and extra[0] holds the sum.
static void create_custom_extra(const CellMapping& mapping, std::array<int, 5>& extra) {
    const int soma_sections = mapping.get_seclist_section_count("soma");
    const int apical_sections = mapping.get_seclist_section_count("apic");
    // extra[2] and extra[3] are not used and therefore do not contribute to the total
    extra = {soma_sections + apical_sections, soma_sections, 0, 0, apical_sections};
}

void BinaryReportHandler::register_section_report(const NrnThread& nt,
                                                  const ReportConfiguration& config,
                                                  const VarsToReport& vars_to_report,
                                                  bool is_soma_target) {
    create_extra_func create_extra = is_soma_target ? create_soma_extra : create_compartment_extra;
    register_report(nt, config, vars_to_report, create_extra);
}

/**
 * Register a custom report: same as register_report with create_custom_extra
 * producing the extra mapping of every gid.
 */
void BinaryReportHandler::register_custom_report(const NrnThread& nt,
                                                 const ReportConfiguration& config,
                                                 const VarsToReport& vars_to_report) {
    // register_report takes the callback by non-const reference, hence the named local
    create_extra_func create_extra = create_custom_extra;
    register_report(nt, config, vars_to_report, create_extra);
}

void BinaryReportHandler::register_report(const NrnThread& nt,
                                          const ReportConfiguration& config,
                                          const VarsToReport& vars_to_report,
                                          create_extra_func& create_extra) {
    int sizemapping = 1;
    int extramapping = 5;
    std::array<int, 1> mapping = {0};
    std::array<int, 5> extra;
    for (const auto& var: vars_to_report) {
        int gid = var.first;
        auto& vars = var.second;
        if (vars.empty()) {
            continue;
        }
        const auto* mapinfo = static_cast<NrnThreadMappingInfo*>(nt.mapping);
        const CellMapping* m = mapinfo->get_cell_mapping(gid);
        extra[0] = vars.size();
        create_extra(*m, extra);
        records_add_report(config.output_path.data(),
                           gid,
                           gid,
                           gid,
                           config.start,
                           config.stop,
                           config.report_dt,
                           sizemapping,
                           config.type_str.data(),
                           extramapping,
                           config.unit.data());

        records_set_report_max_buffer_size_hint(config.output_path.data(), config.buffer_size);
        records_extra_mapping(config.output_path.data(), gid, 5, extra.data());
        for (const auto& var: vars) {
            mapping[0] = var.id;
            records_add_var_with_mapping(
                config.output_path.data(), gid, var.var_value, sizemapping, mapping.data());
        }
    }
}
#endif  // ENABLE_BIN_REPORTS

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/binary_report_handler.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <functional>
#include <memory>
#include <vector>
#include <array>

#include "report_handler.hpp"
#include "coreneuron/io/nrnsection_mapping.hpp"

namespace coreneuron {

/**
 * ReportHandler specialization for binary reports.
 *
 * create_report is always available; the register_* members only exist when
 * ENABLE_BIN_REPORTS is defined.
 */
class BinaryReportHandler: public ReportHandler {
  public:
    /// Create the report described by `config`
    void create_report(ReportConfiguration& config, double dt, double tstop, double delay) override;
#ifdef ENABLE_BIN_REPORTS
    /// Register a section report; `is_soma_target` selects the layout of the extra mapping
    void register_section_report(const NrnThread& nt,
                                 const ReportConfiguration& config,
                                 const VarsToReport& vars_to_report,
                                 bool is_soma_target) override;
    /// Register a custom report
    void register_custom_report(const NrnThread& nt,
                                const ReportConfiguration& config,
                                const VarsToReport& vars_to_report) override;

  private:
    /// Callback filling the 5-element extra mapping of a cell from its CellMapping
    using create_extra_func = std::function<void(const CellMapping&, std::array<int, 5>&)>;
    /// Common implementation behind register_section_report and register_custom_report
    void register_report(const NrnThread& nt,
                         const ReportConfiguration& config,
                         const VarsToReport& vars_to_report,
                         create_extra_func& create_extra);
#endif  // ENABLE_BIN_REPORTS
};

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/nrnreport.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <iostream>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#include <cmath>

#include "coreneuron/network/netcon.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/reports/nrnreport.hpp"
#include "coreneuron/io/nrnsection_mapping.hpp"
#include "coreneuron/mechanism/mech_mapping.hpp"
#include "coreneuron/mechanism/membfunc.hpp"
#ifdef ENABLE_BIN_REPORTS
#include "reportinglib/Records.h"
#endif
#ifdef ENABLE_SONATA_REPORTS
#include "bbp/sonata/reports.h"
#endif

namespace coreneuron {

// Size in MB of the report buffer (4 by default, changed by set_report_buffer_size)
static int size_report_buffer = 4;

/**
 * Give the enabled reporting libraries the opportunity to flush their buffers.
 *
 * Calls records_end_iteration(t) for binary reports and sonata_check_and_flush(t) for
 * SONATA reports; does nothing when neither library is enabled.
 *
 * @param t  time handed to the reporting libraries
 */
void nrn_flush_reports(double t) {
    // flush before buffer is full
#ifdef ENABLE_BIN_REPORTS
    records_end_iteration(t);
#endif
#ifdef ENABLE_SONATA_REPORTS
    sonata_check_and_flush(t);
#endif
}

/** In the current implementation, flush is called during every spike exchange
 *  interval. Hence there should be sufficient buffer to hold all reports
 *  for the duration of a mindelay interval. The call below specifies the
 *  number of timesteps that have to be buffered: mindelay / dt_report rounded
 *  to the nearest integer.
 *  TODO: revisit this because spike exchange can happen a few steps before/after
 *  the mindelay interval, which would call for two extra timesteps of buffer
 *  (no extra steps are added by the code below).
 *
 *  @param dt_report  reporting timestep
 *  @param mindelay   duration of the spike exchange interval
 */
void setup_report_engine(double dt_report, double mindelay) {
    int min_steps_to_record = static_cast<int>(std::round(mindelay / dt_report));
    // silences the unused variable warning when no reporting library is enabled
    static_cast<void>(min_steps_to_record);
#ifdef ENABLE_BIN_REPORTS
    records_set_min_steps_to_record(min_steps_to_record);
    records_setup_communicator();
    records_finish_and_share();
#endif
#ifdef ENABLE_SONATA_REPORTS
    sonata_set_min_steps_to_record(min_steps_to_record);
    sonata_setup_communicators();
    sonata_prepare_datasets();
#endif
}

/**
 * Set the size (in MB) of the report buffers.
 *
 * The value is stored in size_report_buffer and passed as a hint to every enabled
 * reporting library.
 *
 * @param n  buffer size in MB
 */
void set_report_buffer_size(int n) {
    size_report_buffer = n;
#ifdef ENABLE_BIN_REPORTS
    records_set_max_buffer_size_hint(size_report_buffer);
#endif
#ifdef ENABLE_SONATA_REPORTS
    sonata_set_max_buffer_size_hint(size_report_buffer);
#endif
}

/**
 * Flush the enabled reporting libraries, passing the current time of the first
 * thread (nrn_threads[0]._t). Does nothing when no reporting library is enabled.
 */
void finalize_report() {
#ifdef ENABLE_BIN_REPORTS
    records_flush(nrn_threads[0]._t);
#endif
#ifdef ENABLE_SONATA_REPORTS
    sonata_flush(nrn_threads[0]._t);
#endif
}
}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/nrnreport.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

/**
 * @file nrnreport.hpp
 * @brief interface with reportinglib for soma reports
 */

#ifndef _H_NRN_REPORT_
#define _H_NRN_REPORT_

#include <string>
#include <vector>
#include <set>
#include <unordered_map>
#include <cstdint>

#define REPORT_MAX_NAME_LEN     256
#define REPORT_MAX_FILEPATH_LEN 4096

namespace coreneuron {

// Data of one summation report.
// NOTE: the unscoped enumerator ReportType::SummationReport declared further down in
// this namespace has the same name; once it is visible the type has to be spelled
// `struct SummationReport`.
struct SummationReport {
    // Contains the values of the summation with index == segment_id
    std::vector<double> summation_ = {};
    // For every segment_id: the pointers to the currents to sum, each paired with its
    // scaling factor
    std::unordered_map<size_t, std::vector<std::pair<double*, int>>> currents_;
    // Map containing the list of segment_ids per gid
    std::unordered_map<int, std::vector<size_t>> gid_segments_;
};

// Collection of all summation reports, keyed by a string identifying the report
struct SummationReportMapping {
    // Map containing a SummationReport object per report
    std::unordered_map<std::string, SummationReport> summation_reports_;
};

// Settings of the spike report as read by create_report_configurations
struct SpikesInfo {
    // name of the spike file ("out" unless the configuration file provides one)
    std::string file_name = "out";
    // (population name, population offset) pairs
    std::vector<std::pair<std::string, int>> population_info;
};

// name of the variable in the mod file that is used to indicate whether a synapse
// is enabled or disabled for reporting
#define SELECTED_VAR_MOD_NAME "selected_for_report"

/// name of the variable in mod file used for setting synapse id
#define SYNAPSE_ID_MOD_NAME "synapseID"

/*
 * Defines the type of target, as per the following syntax:
 *   0=Compartment, 1=Cell,
 *   Section { 2=Soma, 3=Axon, 4=Dendrite, 5=Apical },
 *   Section, all compartments { 6=Soma, 7=Axon, 8=Dendrite, 9=Apical }
 * The "All" variations are compartment-based: register_target_type sets
 * section_all_compartments for them, while the plain Section variations leave it unset.
 * The numeric values matter: the target type is read as an integer from the report
 * configuration file and cast to this enum.
 */
enum class TargetType {
    Compartment = 0,
    Cell = 1,
    SectionSoma = 2,
    SectionAxon = 3,
    SectionDendrite = 4,
    SectionApical = 5,
    SectionSomaAll = 6,
    SectionAxonAll = 7,
    SectionDendriteAll = 8,
    SectionApicalAll = 9,
};

// Enumeration that defines the type of report requested.
// create_report_configurations derives it from the type string of the configuration
// file: "compartment" gives SectionReport (IMembraneReport when reporting on
// "i_membrane"), "synapse" gives SynapseReport and "summation" gives SummationReport.
enum ReportType {
    SomaReport,
    CompartmentReport,
    SynapseReport,
    IMembraneReport,
    SectionReport,
    SummationReport
};

// Enumeration that defines the section type for a Section report
// (assigned from the TargetType by register_target_type)
enum SectionType { Cell, Soma, Axon, Dendrite, Apical, All };

// Settings of a single report, as filled by create_report_configurations from the
// report configuration file. The non-class members have no default member
// initializer, so a default-initialized local instance leaves them indeterminate.
struct ReportConfiguration {
    std::string name;                     // name of the report
    std::string output_path;              // full path of the report
    std::string target_name;              // target of the report
    std::vector<std::string> mech_names;  // mechanism names
    std::vector<std::string> var_names;   // variable names
    std::vector<int> mech_ids;            // mechanisms
    std::string unit;                     // unit of the report
    std::string format;                   // format of the report (Bin, hdf5, SONATA)
    std::string type_str;                 // type of report string
    TargetType target_type;               // type of the target
    ReportType type;                      // type of the report
    SectionType section_type;             // type of section report
    bool section_all_compartments;        // flag for section report (all values)
    double report_dt;                     // reporting timestep
    double start;                         // start time of report
    double stop;                          // stop time of report
    int num_gids;                         // total number of gids
    int buffer_size;                      // hint on buffer size used for this report
    std::vector<int> target;              // list of gids for this report
};

/// Tell the enabled reporting libraries how many steps (mindelay / dt_report, rounded)
/// have to be buffered and let them finish their setup.
void setup_report_engine(double dt_report, double mindelay);
/// Parse the report configuration file `filename`; report paths are built as
/// `output_dir + "/" + <report name>` and the spike report settings go to `spikes_info`.
std::vector<ReportConfiguration> create_report_configurations(const std::string& filename,
                                                              const std::string& output_dir,
                                                              SpikesInfo& spikes_info);
/// Flush the enabled reporting libraries at the current time of the first thread.
void finalize_report();
/// Let the enabled reporting libraries flush their buffers at time `t`.
void nrn_flush_reports(double t);
/// Set the size in MB of the report buffers.
void set_report_buffer_size(int n);

}  // namespace coreneuron

#endif  //_H_NRN_REPORT_


================================================
FILE: coreneuron/io/reports/report_configuration_parser.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>

#include "coreneuron/io/reports/nrnreport.hpp"
#include "coreneuron/mechanism/mech_mapping.hpp"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/utils/utils.hpp"

namespace coreneuron {


/*
 * Split a comma separated filter string ("mech.var_name,mech.var_name,...") and append
 * every mechanism name to config.mech_names and every variable name to config.var_names.
 *
 * An entry without a variable name gets the variable "i". Only the first two '.'
 * separated fields of an entry are used. If one of the mechanisms is "i_membrane",
 * nrn_use_fast_imem is switched on.
 */
void parse_filter_string(const std::string& filter, ReportConfiguration& config) {
    std::stringstream ss(filter);
    std::string mechanism;
    // Multiple report variables are separated by `,`
    while (std::getline(ss, mechanism, ',')) {
        // Split mechanism name and corresponding reporting variable
        std::string mech_name;
        std::string var_name;
        std::istringstream iss(mechanism);
        std::getline(iss, mech_name, '.');
        std::getline(iss, var_name, '.');
        if (var_name.empty()) {
            var_name = "i";
        }
        if (mech_name == "i_membrane") {
            nrn_use_fast_imem = true;
        }
        config.mech_names.emplace_back(mech_name);
        config.var_names.emplace_back(var_name);
    }
}

/*
 * Store `report_type` in the report and derive section_type and
 * section_all_compartments from report.target_type. An unsupported target type
 * prints an error and calls nrn_abort(1).
 */
void register_target_type(ReportConfiguration& report, ReportType report_type) {
    report.type = report_type;

    // sets both target dependent fields in one go
    const auto assign = [&report](SectionType section, bool all_compartments) {
        report.section_type = section;
        report.section_all_compartments = all_compartments;
    };

    switch (report.target_type) {
        case TargetType::Compartment:
            assign(All, true);
            break;
        case TargetType::Cell:
            assign(Cell, false);
            break;
        case TargetType::SectionSoma:
            assign(Soma, false);
            break;
        case TargetType::SectionSomaAll:
            assign(Soma, true);
            break;
        case TargetType::SectionAxon:
            assign(Axon, false);
            break;
        case TargetType::SectionAxonAll:
            assign(Axon, true);
            break;
        case TargetType::SectionDendrite:
            assign(Dendrite, false);
            break;
        case TargetType::SectionDendriteAll:
            assign(Dendrite, true);
            break;
        case TargetType::SectionApical:
            assign(Apical, false);
            break;
        case TargetType::SectionApicalAll:
            assign(Apical, true);
            break;
        default:
            std::cerr << "Report error: unsupported target type" << std::endl;
            nrn_abort(1);
    }
}

/*
 * Parse the report configuration file `conf_file`.
 *
 * Layout as read here: the number of reports, then for every report one line with
 *   name target_name type report_on unit format target_type report_dt start stop
 *   num_gids buffer_size
 * followed, when num_gids is not 0, by num_gids raw `int` values and a newline.
 * After the reports comes the spike population information: either a population
 * count followed by "name offset" pairs, or (old format) a single name without
 * offset. The last entry is the name of the spike file.
 *
 * The file not being readable, a negative report or gid count, an unreadable gid
 * list and an unsupported report type are reported on std::cerr and end in
 * nrn_abort(1).
 *
 * @param conf_file    path of the report configuration file
 * @param output_dir   directory prefix of every report output path
 * @param spikes_info  receives the population information and the spike file name
 * @return one ReportConfiguration per report of the file
 */
std::vector<ReportConfiguration> create_report_configurations(const std::string& conf_file,
                                                              const std::string& output_dir,
                                                              SpikesInfo& spikes_info) {
    std::string report_on;
    int target = 0;
    std::ifstream report_conf(conf_file);
    if (!report_conf.is_open()) {
        std::cerr << "Report error: cannot open report configuration file " << conf_file
                  << std::endl;
        nrn_abort(1);
    }

    int num_reports = 0;
    report_conf >> num_reports;
    if (num_reports < 0) {
        // a negative count would be turned into a huge size by the vector constructor
        std::cerr << "Report error: invalid number of reports " << num_reports << std::endl;
        nrn_abort(1);
    }
    std::vector<ReportConfiguration> reports(num_reports);
    for (auto& report: reports) {
        report.buffer_size = 4;  // default size to 4 Mb

        report_conf >> report.name >> report.target_name >> report.type_str >> report_on >>
            report.unit >> report.format >> target >> report.report_dt >> report.start >>
            report.stop >> report.num_gids >> report.buffer_size;

        report.target_type = static_cast<TargetType>(target);
        std::transform(report.type_str.begin(),
                       report.type_str.end(),
                       report.type_str.begin(),
                       [](unsigned char c) { return std::tolower(c); });
        report.output_path = output_dir + "/" + report.name;
        ReportType report_type;
        if (report.type_str == "compartment") {
            report_type = SectionReport;
            if (report_on == "i_membrane") {
                nrn_use_fast_imem = true;
                report_type = IMembraneReport;
            }
        } else if (report.type_str == "synapse") {
            report_type = SynapseReport;
        } else if (report.type_str == "summation") {
            report_type = SummationReport;
        } else {
            std::cerr << "Report error: unsupported type " << report.type_str << std::endl;
            nrn_abort(1);
        }
        register_target_type(report, report_type);
        if (report.type == SynapseReport || report.type == SummationReport) {
            parse_filter_string(report_on, report);
        }
        if (report.num_gids < 0) {
            std::cerr << "Report error: invalid number of gids " << report.num_gids
                      << " for report " << report.name << std::endl;
            nrn_abort(1);
        }
        if (report.num_gids) {
            report.target.resize(report.num_gids);
            // the gids are stored as raw ints on the line following the report header
            report_conf.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
            report_conf.read(reinterpret_cast<char*>(report.target.data()),
                             report.num_gids * sizeof(int));
            if (!report_conf) {
                // a short read would leave part of the target list zero-filled
                std::cerr << "Report error: cannot read the " << report.num_gids
                          << " gids of report " << report.name << std::endl;
                nrn_abort(1);
            }
            // extra new line: skip
            report_conf.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
        }
    }
    // read population information for spike report
    int num_populations;
    std::string spikes_population_name;
    int spikes_population_offset;
    if (report_conf.peek() == '\n') {
        // skip newline and move forward to spike reports
        report_conf.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
    }
    if (isdigit(report_conf.peek())) {
        report_conf >> num_populations;
    } else {
        // support old format: one single line "All"
        num_populations = 1;
    }
    for (int i = 0; i < num_populations; i++) {
        if (!(report_conf >> spikes_population_name >> spikes_population_offset)) {
            // support old format: one single line "All" without offset. The name (if
            // any) has already been extracted above; only the offset is missing. The
            // error state is reset so that the failed offset extraction does not
            // silently turn the read of the spike file name below into a no-op.
            report_conf.clear();
            spikes_population_offset = 0;
        }
        spikes_info.population_info.emplace_back(spikes_population_name,
                                                 spikes_population_offset);
    }
    report_conf >> spikes_info.file_name;

    return reports;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/io/reports/report_event.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "report_event.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/reports/nrnreport.hpp"
#include "coreneuron/utils/nrn_assert.h"
#ifdef ENABLE_BIN_REPORTS
#include "reportinglib/Records.h"
#endif  // ENABLE_BIN_REPORTS
#ifdef ENABLE_SONATA_REPORTS
#include "bbp/sonata/reports.h"
#endif  // ENABLE_SONATA_REPORTS

namespace coreneuron {

#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
/**
 * Create the event that drives one report.
 *
 * @param dt             interval between two deliveries of the event
 * @param tstart         time of the first delivery; the initial step is tstart / dt
 * @param filtered_gids  variables to report per gid (must not be empty)
 * @param name           string identifying the report in the reporting libraries
 * @param report_dt      reporting timestep
 */
ReportEvent::ReportEvent(double dt,
                         double tstart,
                         const VarsToReport& filtered_gids,
                         const char* name,
                         double report_dt)
    // initializers follow the declaration order of the members
    : dt(dt)
    , step(tstart / dt)
    , report_path(name)
    , report_dt(report_dt)
    , tstart(tstart)
    , vars_to_report(filtered_gids) {
    nrn_assert(filtered_gids.size());
    // summation_alu computes `step % reporting_period`: a period of 0 (report_dt < dt)
    // would make that modulo undefined, so the period is at least one step.
    reporting_period = std::max(1, static_cast<int>(report_dt / dt));
    gids_to_report.reserve(filtered_gids.size());
    for (const auto& gid: filtered_gids) {
        gids_to_report.push_back(gid.first);
    }
    std::sort(gids_to_report.begin(), gids_to_report.end());
}

/**
 * Refresh the values of the summation report stored under report_path.
 *
 * On a reporting step (step > 0 and a multiple of reporting_period) the scaled
 * currents of every segment are summed into summation_[segment_id]. For every gid
 * listed in gid_segments_ the sum over its segments is then written to the first
 * variable reported for that gid.
 *
 * @param nt  thread owning the summation report handler
 */
void ReportEvent::summation_alu(NrnThread* nt) {
    // a period below 1 would make the modulo undefined: handle it like a period of 1
    const int period = reporting_period > 0 ? reporting_period : 1;
    // Sum currents only on reporting steps
    if (!(step > 0) || (static_cast<int>(step) % period) != 0) {
        return;
    }
    auto& summation_report = nt->summation_report_handler_->summation_reports_[report_path];
    // Add currents of all variables in each segment
    for (const auto& kv: summation_report.currents_) {
        double sum = 0.0;
        for (const auto& value: kv.second) {
            // value.first points to the current, value.second is its scaling factor
            sum += *value.first * value.second;
        }
        summation_report.summation_[kv.first] = sum;
    }
    // Add all currents in the soma
    // Only when type summation and soma target (gid_segments_ is empty otherwise)
    for (const auto& kv: summation_report.gid_segments_) {
        double sum_soma = 0.0;
        for (const auto& segment_id: kv.second) {
            sum_soma += summation_report.summation_[segment_id];
        }
        // find() instead of operator[]: an unknown gid must not insert an empty entry
        // whose front() would then be dereferenced
        const auto it = vars_to_report.find(kv.first);
        if (it != vars_to_report.end() && !it->second.empty()) {
            *(it->second.front().var_value) = sum_soma;
        }
    }
}

/** on deliver, call ReportingLib and setup next event
 *
 *  The summation values are refreshed first, then the enabled reporting libraries
 *  record the current step for the sorted gids of this report. Finally the event is
 *  sent again for time t + dt and the step counter is advanced.
 */
void ReportEvent::deliver(double t, NetCvode* nc, NrnThread* nt) {
/* reportinglib is not thread safe */
#pragma omp critical
    {
        summation_alu(nt);
        // each thread needs to know its own step
#ifdef ENABLE_BIN_REPORTS
        records_nrec(step, gids_to_report.size(), gids_to_report.data(), report_path.data());
#endif
#ifdef ENABLE_SONATA_REPORTS
        sonata_record_node_data(step,
                                gids_to_report.size(),
                                gids_to_report.data(),
                                report_path.data());
#endif
        send(t + dt, nc, nt);
        step++;
    }
}

/// Always returns false for report events.
bool ReportEvent::require_checkpoint() {
    return false;
}
#endif  // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/report_event.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <algorithm>
#include <unordered_map>
#include <vector>
#include <string>

#include "coreneuron/network/netcon.hpp"
#include "coreneuron/network/netcvode.hpp"

namespace coreneuron {

#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
/// A variable to report: pointer to its value plus the id used as its mapping
struct VarWithMapping {
    uint32_t id;        // mapping id of the variable
    double* var_value;  // location of the value to report (not owned by this struct)
    /// `id_` is converted from int to the unsigned `id` member
    VarWithMapping(int id_, double* v_)
        : id(id_)
        , var_value(v_) {}
};

// maps each gid to the list of variables (id and value pointer) to report for it
using VarsToReport = std::unordered_map<uint64_t, std::vector<VarWithMapping>>;

/**
 * Self re-sending event that records the variables of one report at every delivery.
 */
class ReportEvent: public DiscreteEvent {
  public:
    /// @param dt             interval between two deliveries
    /// @param tstart         start time, defines the initial step (tstart / dt)
    /// @param filtered_gids  variables to report per gid, must not be empty
    /// @param name           string identifying the report
    /// @param report_dt      reporting timestep
    ReportEvent(double dt,
                double tstart,
                const VarsToReport& filtered_gids,
                const char* name,
                double report_dt);

    /** on deliver, call ReportingLib and setup next event */
    void deliver(double t, NetCvode* nc, NrnThread* nt) override;
    bool require_checkpoint() override;
    /// Refresh the summed currents of a summation report on reporting steps
    void summation_alu(NrnThread* nt);

  private:
    double dt;                        // interval between two deliveries
    double step;                      // current step, incremented at every delivery
    std::string report_path;          // identifies the report in the reporting libraries
    double report_dt;                 // reporting timestep
    int reporting_period;             // report_dt expressed in steps of dt
    std::vector<int> gids_to_report;  // sorted gids of the report
    double tstart;                    // start time given at construction
    VarsToReport vars_to_report;      // copy of the variables to report per gid
};
#endif  // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/report_handler.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "report_handler.hpp"
#include "coreneuron/io/nrnsection_mapping.hpp"
#include "coreneuron/mechanism/mech_mapping.hpp"
#include "coreneuron/utils/utils.hpp"

namespace coreneuron {

/**
 * Return the gids that are both owned by the thread `nt` (gid_ of its first nt.ncell
 * presyns) and contained in `target_gids`, in ascending order.
 *
 * Note that `target_gids` is sorted in place.
 */
template <typename T>
std::vector<T> intersection_gids(const NrnThread& nt, std::vector<T>& target_gids) {
    // collect the gids of this thread
    std::vector<int> local_gids;
    for (int icell = 0; icell < nt.ncell; ++icell) {
        local_gids.push_back(nt.presyns[icell].gid_);
    }

    // std::set_intersection requires both ranges to be sorted
    std::sort(local_gids.begin(), local_gids.end());
    std::sort(target_gids.begin(), target_gids.end());

    std::vector<T> common_gids;
    std::set_intersection(local_gids.begin(),
                          local_gids.end(),
                          target_gids.begin(),
                          target_gids.end(),
                          std::back_inserter(common_gids));
    return common_gids;
}

/**
 * Set up one report on every thread that holds cells.
 *
 * The reporting window of \a report_config is clamped to [t, tstop], the requested
 * mechanism names are resolved to mechanism ids (appended to report_config.mech_ids) and,
 * for every thread with cells, the variables to report are collected and handed to the
 * register_section_report / register_custom_report hooks. A ReportEvent is then queued
 * for each thread that has something to report.
 *
 * When neither ENABLE_BIN_REPORTS nor ENABLE_SONATA_REPORTS is defined, only a warning is
 * printed on rank 0.
 *
 * \param report_config report description; start, stop and mech_ids are updated and the
 *                      target gid list is sorted in place
 * \param dt            simulation time step forwarded to the ReportEvent
 * \param tstop         end of the simulation, upper bound for the report stop time
 * \param delay         not used by this implementation
 */
void ReportHandler::create_report(ReportConfiguration& report_config,
                                  double dt,
                                  double tstop,
                                  double delay) {
#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
    if (report_config.start < t) {
        report_config.start = t;
    }
    report_config.stop = std::min(report_config.stop, tstop);

    // Resolve the mechanism names and remember the first one without a usable id.
    // A negative id can never index nt._ml_list, so it is treated as "not mapped"
    // (presumably what nrn_get_mechtype returns for an unknown name -- confirm).
    const char* unmapped_mech = nullptr;
    for (const auto& mech: report_config.mech_names) {
        const int mech_id = nrn_get_mechtype(mech.data());
        if (mech_id < 0 && unmapped_mech == nullptr) {
            unmapped_mech = mech.data();
        }
        report_config.mech_ids.emplace_back(mech_id);
    }
    // Synapse reports index nt._ml_list with their mechanism id, so refuse to go on
    // without a valid one. The name is only read when it actually exists.
    if (report_config.type == SynapseReport &&
        (report_config.mech_ids.empty() || unmapped_mech != nullptr)) {
        std::cerr << "[ERROR] mechanism to report: "
                  << (unmapped_mech != nullptr ? unmapped_mech : "<none>")
                  << " is not mapped in this simulation, cannot report on it \n";
        nrn_abort(1);
    }
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        // section reports use the voltage array unless the report type overrides it below
        double* report_variable = nt._actual_v;
        if (!nt.ncell) {
            continue;
        }
        const std::vector<int> nodes_to_gid = map_gids(nt);
        const std::vector<int> gids_to_report = intersection_gids(nt, report_config.target);
        VarsToReport vars_to_report;
        switch (report_config.type) {
            case IMembraneReport:
                // same handling as a section report, only the reported array differs
                report_variable = nt.nrn_fast_imem->nrn_sav_rhs;
                // fall through
            case SectionReport: {
                vars_to_report = get_section_vars_to_report(nt,
                                                            gids_to_report,
                                                            report_variable,
                                                            report_config.section_type,
                                                            report_config.section_all_compartments);
                const bool is_soma_target = report_config.section_type == SectionType::Soma ||
                                            report_config.section_type == SectionType::Cell;
                register_section_report(nt, report_config, vars_to_report, is_soma_target);
                break;
            }
            case SummationReport:
                vars_to_report =
                    get_summation_vars_to_report(nt, gids_to_report, report_config, nodes_to_gid);
                register_custom_report(nt, report_config, vars_to_report);
                break;
            default:
                // every remaining report type is handled as a synapse report
                vars_to_report =
                    get_synapse_vars_to_report(nt, gids_to_report, report_config, nodes_to_gid);
                register_custom_report(nt, report_config, vars_to_report);
                break;
        }
        if (!vars_to_report.empty()) {
            auto report_event = std::make_unique<ReportEvent>(
                dt, t, vars_to_report, report_config.output_path.data(), report_config.report_dt);
            report_event->send(t, net_cvode_instance, &nt);
            // the handler keeps ownership of the queued events
            m_report_events.push_back(std::move(report_event));
        }
    }
#else
    if (nrnmpi_myid == 0) {
        std::cerr << "[WARNING] : Reporting is disabled. Please recompile with either libsonata or "
                     "reportinglib. \n";
    }
#endif  // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
}

#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
/**
 * Default hook for section reports: the base handler supports no output format, so it
 * only warns (on rank 0) that the requested format cannot be written.
 */
void ReportHandler::register_section_report(const NrnThread& nt,
                                            const ReportConfiguration& config,
                                            const VarsToReport& vars_to_report,
                                            bool is_soma_target) {
    // print the warning once, not once per rank
    if (nrnmpi_myid != 0) {
        return;
    }
    std::cerr << "[WARNING] : Format '" << config.format << "' in report '";
    std::cerr << config.output_path << "' not supported.\n";
}
/**
 * Default hook for summation and synapse reports: the base handler supports no output
 * format, so it only warns (on rank 0) that the requested format cannot be written.
 */
void ReportHandler::register_custom_report(const NrnThread& nt,
                                           const ReportConfiguration& config,
                                           const VarsToReport& vars_to_report) {
    // print the warning once, not once per rank
    if (nrnmpi_myid != 0) {
        return;
    }
    std::cerr << "[WARNING] : Format '" << config.format << "' in report '";
    std::cerr << config.output_path << "' not supported.\n";
}

/**
 * Translate a SectionType into the section-list name used to look up section mappings.
 *
 * Cell and Soma both map to "soma". An unhandled value prints an error and aborts.
 *
 * \param type section type requested by the report
 * \return the name associated with \a type
 */
std::string getSectionTypeStr(SectionType type) {
    switch (type) {
        case All:
            return "All";
        case Cell:
        case Soma:
            return "soma";
        case Axon:
            return "axon";
        case Dendrite:
            return "dend";
        case Apical:
            return "apic";
        default:
            break;
    }
    std::cerr << "SectionType not handled in getSectionTypeStr" << std::endl;
    nrn_abort(1);
    // Not expected to be reached. The explicit return keeps every path of this
    // value-returning function well defined even if nrn_abort were to return.
    return std::string();
}

/**
 * Append the report entries of every section in \a sections to \a to_report.
 *
 * Each entry pairs the section id with a pointer into \a report_variable, offset by a
 * segment id. With \a all_compartments every segment of a section is added; otherwise
 * only the middle segment is, and the section must have an odd number of segments.
 */
void register_sections_to_report(const SecMapping* sections,
                                 std::vector<VarWithMapping>& to_report,
                                 double* report_variable,
                                 bool all_compartments) {
    for (const auto& entry: sections->secmap) {
        const int section_id = entry.first;
        const auto& segments = entry.second;

        if (!all_compartments) {
            // one value per section: the segment in the middle of the list
            nrn_assert(segments.size() % 2);
            const auto middle_segment = segments[segments.size() / 2];
            to_report.emplace_back(VarWithMapping(section_id, report_variable + middle_segment));
            continue;
        }

        // one value per segment of the section
        for (const auto& segment: segments) {
            to_report.emplace_back(VarWithMapping(section_id, report_variable + segment));
        }
    }
}

/**
 * Collect, per gid, the variables of a section (compartment) report.
 *
 * For SectionType "All" every section list of the cell is registered; for any other
 * type only the matching section list is, provided the cell has sections of that type.
 * Aborts when the thread or one of the cells has no mapping information.
 *
 * \param nt               thread owning the cells
 * \param gids_to_report   gids of this thread that are part of the report
 * \param report_variable  base of the array the reported pointers are offset into
 * \param section_type     kind of section to report
 * \param all_compartments report every segment instead of the middle one only
 * \return map from gid to the list of reported variables (possibly empty per gid)
 */
VarsToReport ReportHandler::get_section_vars_to_report(const NrnThread& nt,
                                                       const std::vector<int>& gids_to_report,
                                                       double* report_variable,
                                                       SectionType section_type,
                                                       bool all_compartments) const {
    const std::string seclist_name = getSectionTypeStr(section_type);
    const bool every_section = seclist_name == "All";

    const auto* thread_mapping = static_cast<NrnThreadMappingInfo*>(nt.mapping);
    if (thread_mapping == nullptr) {
        std::cerr << "[COMPARTMENTS] Error : mapping information is missing for a Cell group "
                  << nt.ncell << '\n';
        nrn_abort(1);
    }

    VarsToReport result;
    for (const auto& gid: gids_to_report) {
        const auto& cell_mapping = thread_mapping->get_cell_mapping(gid);
        if (!cell_mapping) {
            std::cerr
                << "[COMPARTMENTS] Error : Compartment mapping information is missing for gid "
                << gid << '\n';
            nrn_abort(1);
        }

        std::vector<VarWithMapping> cell_vars;
        cell_vars.reserve(cell_mapping->size());

        if (every_section) {
            for (const auto& sections: cell_mapping->secmapvec) {
                register_sections_to_report(sections, cell_vars, report_variable, all_compartments);
            }
        } else if (cell_mapping->get_seclist_section_count(seclist_name) > 0) {
            // the section list of the requested type exists for this cell
            const auto& sections = cell_mapping->get_seclist_mapping(seclist_name);
            register_sections_to_report(sections, cell_vars, report_variable, all_compartments);
        }
        result[gid] = std::move(cell_vars);
    }
    return result;
}

/**
 * Collect, per gid, the variables of a summation report and record in the thread's
 * summation report which currents contribute to each segment.
 *
 * For every requested mechanism other than "i_membrane", the location of the requested
 * variable of each instance belonging to the gid is appended to
 * summation_report.currents_[segment], together with a scale factor (-1 for IClamp and
 * SEClamp, 1 otherwise). If "i_membrane" is requested, nrn_sav_rhs of every segment of
 * the cell is appended with scale 1. The reported pointers refer to
 * summation_report.summation_, which is sized to nt.end.
 *
 * \param nt              thread owning the cells
 * \param gids_to_report  gids of this thread that are part of the report
 * \param report          report configuration (mechanisms, variables, section type)
 * \param nodes_to_gids   gid of every node of the thread, indexed by node index
 * \return map from gid to the list of reported variables (possibly empty per gid)
 */
VarsToReport ReportHandler::get_summation_vars_to_report(
    const NrnThread& nt,
    const std::vector<int>& gids_to_report,
    const ReportConfiguration& report,
    const std::vector<int>& nodes_to_gids) const {
    VarsToReport vars_to_report;
    const auto* mapinfo = static_cast<NrnThreadMappingInfo*>(nt.mapping);
    auto& summation_report = nt.summation_report_handler_->summation_reports_[report.output_path];
    if (!mapinfo) {
        std::cerr << "[COMPARTMENTS] Error : mapping information is missing for a Cell group "
                  << nt.ncell << '\n';
        nrn_abort(1);
    }

    for (const auto& gid: gids_to_report) {
        bool has_imembrane = false;
        for (std::size_t i = 0; i < report.mech_ids.size(); ++i) {
            const auto mech_id = report.mech_ids[i];
            const auto& var_name = report.var_names[i];
            const auto& mech_name = report.mech_names[i];
            if (mech_name == "i_membrane") {
                // handled below, once the segments of the cell are known
                has_imembrane = true;
                continue;
            }
            // Clamp processes need their current flipped. The factor is computed per
            // mechanism so that it does not leak onto the mechanisms listed after a clamp.
            const int scale = (mech_name == "IClamp" || mech_name == "SEClamp") ? -1 : 1;
            Memb_list* ml = nt._ml_list[mech_id];
            if (!ml) {
                continue;
            }

            for (int j = 0; j < ml->nodecount; j++) {
                const auto segment_id = ml->nodeindices[j];
                if (nodes_to_gids[segment_id] == gid) {
                    double* var_value =
                        get_var_location_from_var_name(mech_id, var_name.data(), ml, j);
                    summation_report.currents_[segment_id].push_back(
                        std::make_pair(var_value, scale));
                }
            }
        }
        const auto& cell_mapping = mapinfo->get_cell_mapping(gid);
        if (cell_mapping == nullptr) {
            std::cerr << "[SUMMATION] Error : Compartment mapping information is missing for gid "
                      << gid << '\n';
            nrn_abort(1);
        }
        std::vector<VarWithMapping> to_report;
        to_report.reserve(cell_mapping->size());
        // one accumulator per node of the thread; the reported pointers point into it
        summation_report.summation_.resize(nt.end);
        double* report_variable = summation_report.summation_.data();
        const auto& section_type_str = getSectionTypeStr(report.section_type);
        if (report.section_type != SectionType::All) {
            if (cell_mapping->get_seclist_section_count(section_type_str) > 0) {
                const auto& sections = cell_mapping->get_seclist_mapping(section_type_str);
                register_sections_to_report(sections,
                                            to_report,
                                            report_variable,
                                            report.section_all_compartments);
            }
        }
        const auto& section_mapping = cell_mapping->secmapvec;
        for (const auto& sections: section_mapping) {
            for (auto& section: sections->secmap) {
                // compartment_id
                int section_id = section.first;
                auto& segment_ids = section.second;
                for (const auto& segment_id: segment_ids) {
                    if (has_imembrane) {
                        // membrane current of the segment, taken as is (scale 1)
                        summation_report.currents_[segment_id].push_back(
                            std::make_pair(nt.nrn_fast_imem->nrn_sav_rhs + segment_id, 1));
                    }
                    if (report.section_type == SectionType::All) {
                        // corresponding slot in the summation array
                        double* variable = report_variable + segment_id;
                        to_report.emplace_back(VarWithMapping(section_id, variable));
                    } else if (report.section_type == SectionType::Cell) {
                        // remember which segments make up the cell
                        summation_report.gid_segments_[gid].push_back(segment_id);
                    }
                }
            }
        }
        vars_to_report[gid] = to_report;
    }
    return vars_to_report;
}

/**
 * Collect, per gid, the variables of a synapse report.
 *
 * Exactly one mechanism must be configured. Every instance of that mechanism located on
 * a node of the gid is reported, unless the mechanism exposes the SELECTED_VAR_MOD_NAME
 * variable and its value is zero for that instance. Each entry pairs the synapse id
 * (read from SYNAPSE_ID_MOD_NAME) with the location of the requested variable. Gids
 * without any reported instance are left out of the result.
 */
VarsToReport ReportHandler::get_synapse_vars_to_report(
    const NrnThread& nt,
    const std::vector<int>& gids_to_report,
    const ReportConfiguration& report,
    const std::vector<int>& nodes_to_gids) const {
    VarsToReport result;
    for (const auto& gid: gids_to_report) {
        // There can only be 1 mechanism
        nrn_assert(report.mech_ids.size() == 1);
        const auto mech_id = report.mech_ids[0];
        const auto& var_name = report.var_names[0];
        Memb_list* ml = nt._ml_list[mech_id];
        if (ml == nullptr) {
            continue;
        }

        std::vector<VarWithMapping> synapses;
        synapses.reserve(ml->nodecount);

        for (int inst = 0; inst < ml->nodecount; ++inst) {
            /// without the selection variable in the mod file every instance is reported,
            /// otherwise the flag set in the mod file decides
            const double* selected_flag =
                get_var_location_from_var_name(mech_id, SELECTED_VAR_MOD_NAME, ml, inst);
            const bool selected = selected_flag == nullptr || *selected_flag != 0.;

            if (nodes_to_gids[ml->nodeindices[inst]] != gid || !selected) {
                continue;
            }
            double* var_value = get_var_location_from_var_name(mech_id, var_name.data(), ml, inst);
            double* synapse_id =
                get_var_location_from_var_name(mech_id, SYNAPSE_ID_MOD_NAME, ml, inst);
            nrn_assert(synapse_id && var_value);
            synapses.emplace_back(static_cast<int>(*synapse_id), var_value);
        }

        if (!synapses.empty()) {
            result[gid] = std::move(synapses);
        }
    }
    return result;
}

/**
 * Build the table giving, for every node of the thread, the gid of the cell it belongs
 * to (-1 where no gid was assigned). Done with a backward sweep followed by a forward
 * sweep over the nodes.
 *
 * NOTE(review): the forward sweep starts at ncell + 1, so node ncell keeps whatever the
 * backward sweep assigned to it -- confirm this is intended.
 */
std::vector<int> ReportHandler::map_gids(const NrnThread& nt) const {
    std::vector<int> gid_of_node(nt.end, -1);
    const auto& parent_of = nt._v_parent_index;

    // backward sweep: starting at the node watched by each presyn, walk up the parent
    // chain and tag every visited node with the gid of the presyn
    for (int i_presyn = 0; i_presyn < nt.n_presyn; ++i_presyn) {
        const auto& presyn = nt.presyns[i_presyn];
        const int node_index = presyn.thvar_index_;
        // artificial cells have no threshold node
        if (node_index < 0) {
            continue;
        }
        const int gid = presyn.gid_;
        gid_of_node[node_index] = gid;
        // index 0 has its parent set to 0, so the walk has to stop once it reaches 0
        int node = node_index;
        while (node > 0) {
            node = parent_of[node];
            gid_of_node[node] = gid;
        }
    }

    // forward sweep: every remaining node takes the gid of its parent. This relies on
    // nodes being stored with parents before their children.
    for (int node = nt.ncell + 1; node < nt.end; ++node) {
        gid_of_node[node] = gid_of_node[parent_of[node]];
    }
    return gid_of_node;
}
#endif  // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/report_handler.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <memory>
#include <vector>

#include "nrnreport.hpp"
#include "coreneuron/io/reports/report_event.hpp"
#include "coreneuron/sim/multicore.hpp"

namespace coreneuron {

/**
 * Base class turning a ReportConfiguration into per-thread reports.
 *
 * create_report() gathers the variables to report for each thread and hands them to the
 * register_section_report() / register_custom_report() hooks, which derived handlers
 * override to talk to a concrete reporting backend. The implementations in this class
 * only print a "format not supported" warning.
 */
class ReportHandler {
  public:
    virtual ~ReportHandler() = default;

    /// Create the report described by \a config on every thread that has cells.
    /// Without report support compiled in, only a warning is printed on rank 0.
    virtual void create_report(ReportConfiguration& config, double dt, double tstop, double delay);
#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
    /// Hook called for section and membrane-current reports of one thread.
    virtual void register_section_report(const NrnThread& nt,
                                         const ReportConfiguration& config,
                                         const VarsToReport& vars_to_report,
                                         bool is_soma_target);
    /// Hook called for summation and synapse reports of one thread.
    virtual void register_custom_report(const NrnThread& nt,
                                        const ReportConfiguration& config,
                                        const VarsToReport& vars_to_report);
    /// Per gid, pointers into \a report_variable for the sections of the requested type.
    VarsToReport get_section_vars_to_report(const NrnThread& nt,
                                            const std::vector<int>& gids_to_report,
                                            double* report_variable,
                                            SectionType section_type,
                                            bool all_compartments) const;
    /// Per gid, pointers into the thread's summation array; also records which
    /// currents contribute to every segment.
    VarsToReport get_summation_vars_to_report(const NrnThread& nt,
                                              const std::vector<int>& gids_to_report,
                                              const ReportConfiguration& report,
                                              const std::vector<int>& nodes_to_gids) const;
    /// Per gid, the reported variable of every selected instance of the configured
    /// mechanism, keyed by synapse id.
    VarsToReport get_synapse_vars_to_report(const NrnThread& nt,
                                            const std::vector<int>& gids_to_report,
                                            const ReportConfiguration& report,
                                            const std::vector<int>& nodes_to_gids) const;
    /// Gid of the cell owning each node of the thread, indexed by node index.
    std::vector<int> map_gids(const NrnThread& nt) const;
#endif  // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
  protected:
#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
    /// Events queued by create_report(); owned by the handler.
    std::vector<std::unique_ptr<ReportEvent>> m_report_events;
#endif  // defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
};

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/sonata_report_handler.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "sonata_report_handler.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/network/netcon.hpp"
#include "coreneuron/io/nrnsection_mapping.hpp"
#include "coreneuron/mechanism/mech_mapping.hpp"
#ifdef ENABLE_SONATA_REPORTS
#include "bbp/sonata/reports.h"
#endif  // ENABLE_SONATA_REPORTS

namespace coreneuron {

/**
 * Create a report through the generic ReportHandler logic.
 *
 * When SONATA reports are compiled in, \a dt is first passed to
 * sonata_set_atomic_step(); all arguments are then forwarded unchanged to
 * ReportHandler::create_report().
 */
void SonataReportHandler::create_report(ReportConfiguration& config,
                                        double dt,
                                        double tstop,
                                        double delay) {
#ifdef ENABLE_SONATA_REPORTS
    // must happen before the base class registers anything with the library
    // NOTE(review): ordering requirement assumed from the call order -- confirm
    sonata_set_atomic_step(dt);
#endif  // ENABLE_SONATA_REPORTS
    ReportHandler::create_report(config, dt, tstop, delay);
}

#ifdef ENABLE_SONATA_REPORTS
/// Section reports are registered exactly like any other report;
/// \a is_soma_target is not used by the SONATA handler.
void SonataReportHandler::register_section_report(const NrnThread& nt,
                                                  const ReportConfiguration& config,
                                                  const VarsToReport& vars_to_report,
                                                  bool is_soma_target) {
    register_report(nt, config, vars_to_report);
}

/// Summation and synapse reports are registered through the common register_report().
void SonataReportHandler::register_custom_report(const NrnThread& nt,
                                                 const ReportConfiguration& config,
                                                 const VarsToReport& vars_to_report) {
    register_report(nt, config, vars_to_report);
}

/**
 * Find the population a gid belongs to.
 *
 * The population list is scanned in order and the scan stops at the first entry whose
 * offset is greater than \a gid; the entry seen just before is returned. If even the
 * first entry has a larger offset, the first entry is returned.
 *
 * \param gid gid to look up
 * \return (population name, population offset); ("All", 0) when no population
 *         information is available
 */
std::pair<std::string, int> SonataReportHandler::get_population_info(int gid) {
    const auto& populations = m_spikes_info.population_info;
    if (populations.empty()) {
        return std::make_pair("All", 0);
    }
    // Track the matching entry by address so that nothing is copied while scanning;
    // the single copy happens on return.
    const auto* match = &populations.front();
    for (const auto& name_offset: populations) {
        const int pop_offset = name_offset.second;
        if (pop_offset > gid) {
            break;
        }
        match = &name_offset;
    }
    return *match;
}

void SonataReportHandler::register_report(const NrnThread& nt,
                                          const ReportConfiguration& config,
                                          const VarsToReport& vars_to_report) {
    sonata_create_report(config.output_path.data(),
                         config.start,
                         config.stop,
                         config.report_dt,
                         config.unit.data(),
                         config.type_str.data());
    sonata_set_report_max_buffer_size_hint(config.output_path.data(), config.buffer_size);

    for (const auto& kv: vars_to_report) {
        uint64_t gid = kv.first;
        const std::vector<VarWithMapping>& vars = kv.second;
        if (!vars.size())
            continue;

        const auto& pop_info = get_population_info(gid);
        std::string population_name = pop_info.first;
        int population_offset = pop_info.second;
        sonata_add_node(config.output_path.data(), population_name.data(), population_offset, gid);
        sonata_set_report_max_buffer_size_hint(config.output_path.data(), config.buffer_size);
        for (const auto& variable: vars) {
            sonata_add_element(config.output_path.data(),
                               population_name.data(),
                               gid,
                               variable.id,
                               variable.var_value);
        }
    }
}
#endif  // ENABLE_SONATA_REPORTS
}  // Namespace coreneuron


================================================
FILE: coreneuron/io/reports/sonata_report_handler.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <memory>
#include <vector>

#include "report_handler.hpp"

namespace coreneuron {

/**
 * ReportHandler that registers reports with the SONATA reporting library.
 *
 * Keeps a copy of the SpikesInfo it is constructed with; its population information is
 * used to attribute every reported gid to a population.
 */
class SonataReportHandler: public ReportHandler {
  public:
    /// Store a copy of \a spikes_info for later population lookups.
    SonataReportHandler(const SpikesInfo& spikes_info)
        : m_spikes_info(spikes_info) {}

    /// Pass dt to the SONATA library (when compiled in), then run the generic logic.
    void create_report(ReportConfiguration& config, double dt, double tstop, double delay) override;
#ifdef ENABLE_SONATA_REPORTS
    /// Forwards to register_report(); is_soma_target is ignored.
    void register_section_report(const NrnThread& nt,
                                 const ReportConfiguration& config,
                                 const VarsToReport& vars_to_report,
                                 bool is_soma_target) override;
    /// Forwards to register_report().
    void register_custom_report(const NrnThread& nt,
                                const ReportConfiguration& config,
                                const VarsToReport& vars_to_report) override;

  private:
    /// Create the report in the library and add its nodes and elements.
    void register_report(const NrnThread& nt,
                         const ReportConfiguration& config,
                         const VarsToReport& vars_to_report);
    /// (name, offset) of the population \a gid belongs to; ("All", 0) if none is known.
    std::pair<std::string, int> get_population_info(int gid);
#endif  // ENABLE_SONATA_REPORTS

  private:
    /// Copy of the spikes information given at construction.
    SpikesInfo m_spikes_info;
};

}  // Namespace coreneuron


================================================
FILE: coreneuron/io/setup_fornetcon.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/io/setup_fornetcon.hpp"
#include "coreneuron/network/netcon.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include <map>
#include <utility>

namespace coreneuron {

/**
   If FOR_NETCONS is in use, set up the NrnThread fornetcon related info,

   i.e. NrnThread._fornetcon_perm_indices, NrnThread._fornetcon_weight_perm,
   and the relevant dparam element of each mechanism instance that uses
   a FOR_NETCONS statement.

   Makes use of nrn_fornetcon_cnt_, nrn_fornetcon_type_,
   and nrn_fornetcon_index_ that were specified during registration of
   mechanisms that use FOR_NETCONS.

   nrn_fornetcon_cnt_ is the number of mechanisms that use FOR_NETCONS.
   nrn_fornetcon_type_ is an int array of size nrn_fornetcon_cnt_ that specifies
   the mechanism type.
   nrn_fornetcon_index_ is an int array of size nrn_fornetcon_cnt_ that
   specifies the index into an instance's dparam int array having the
   fornetcon semantics.

   FOR_NETCONS (args) means to loop over all NetCons connecting to this
   target instance, and args are the names of the items of each NetCon's
   weight vector (same as the enclosing NET_RECEIVE but possibly with different
   local names).

   NrnThread._weights is a vector of weight groups where the number of groups
   is the number of NetCon in this thread and each group has a size
   equal to the number of args in the target NET_RECEIVE block. The order
   of these groups is the NetCon Object order in HOC (the construction order).
   So the weight vector indices for the NetCons in the FOR_NETCONS loop
   are not adjacent.

   NrnThread._fornetcon_weight_perm is an index vector into the
   NrnThread._weights vector such that the list of indices that target a
   mechanism instance are adjacent.
   NrnThread._fornetcon_perm_indices is an index vector into the
   NrnThread._fornetcon_weight_perm to the first of the list of NetCon weights
   that target the instance. The index of _fornetcon_perm_indices
   containing this first in the list is stored in the mechanism instance's
   dparam at the dparam's semantic fornetcon slot. (Note that the next index
   points to the first index of the next target instance.)

**/

static int* fornetcon_slot(const int mtype,
                           const int instance,
                           const int fnslot,
                           const NrnThread& nt) {
    int layout = corenrn.get_mech_data_layout()[mtype];
    int sz = corenrn.get_prop_dparam_size()[mtype];
    Memb_list* ml = nt._ml_list[mtype];
    int* fn = nullptr;
    if (layout == Layout::AoS) {
        fn = ml->pdata + (instance * sz + fnslot);
    } else if (layout == Layout::SoA) {
        int padded_cnt = nrn_soa_padded_size(ml->nodecount, layout);
        fn = ml->pdata + (fnslot * padded_cnt + instance);
    }
    return fn;
}

/**
 * Fill nt._fornetcon_perm_indices and nt._fornetcon_weight_perm and set the fornetcon
 * dparam slot of every instance of the FOR_NETCONS mechanisms present in the thread.
 *
 * Returns without touching the thread when no mechanism uses FOR_NETCONS or when none
 * of them has instances in this thread. The work is done in several passes that reuse
 * the dparam fornetcon slots as scratch storage (counter, then write cursor), so the
 * order of the passes matters; the slots only hold their final value after the last
 * pass.
 *
 * \param nt thread to set up; previously allocated fornetcon arrays are released
 */
void setup_fornetcon_info(NrnThread& nt) {
    if (nrn_fornetcon_cnt_ == 0) {
        return;
    }

    // Mechanism types in use that have FOR_NETCONS statements
    // Nice to have the dparam fornetcon slot as well so use map
    // instead of set
    std::map<int, int> type_to_slot;
    for (int i = 0; i < nrn_fornetcon_cnt_; ++i) {
        int type = nrn_fornetcon_type_[i];
        Memb_list* ml = nt._ml_list[type];
        if (ml && ml->nodecount) {
            type_to_slot[type] = nrn_fornetcon_index_[i];
        }
    }
    if (type_to_slot.empty()) {
        return;
    }

    // How many NetCons (weight groups) are involved.
    // Also count how many weight groups for each target instance.
    // For the latter we can count in the dparam fornetcon slot.

    // zero the dparam fornetcon slot for counting and count number of slots.
    size_t n_perm_indices = 0;
    for (const auto& kv: type_to_slot) {
        int mtype = kv.first;
        int fnslot = kv.second;
        int nodecount = nt._ml_list[mtype]->nodecount;
        for (int i = 0; i < nodecount; ++i) {
            int* fn = fornetcon_slot(mtype, i, fnslot, nt);
            *fn = 0;
            n_perm_indices += 1;
        }
    }

    // Count how many weight groups for each slot and total number of weight groups
    size_t n_weight_perm = 0;
    for (int i = 0; i < nt.n_netcon; ++i) {
        NetCon& nc = nt.netcons[i];
        int mtype = nc.target_->_type;
        auto search = type_to_slot.find(mtype);
        if (search != type_to_slot.end()) {
            int i_instance = nc.target_->_i_instance;
            int* fn = fornetcon_slot(mtype, i_instance, search->second, nt);
            *fn += 1;
            n_weight_perm += 1;
        }
    }

    // Displacement vector has an extra element since the number for last item
    // at n-1 is x[n] - x[n-1] and number for first is x[0] = 0.
    delete[] std::exchange(nt._fornetcon_perm_indices, nullptr);
    delete[] std::exchange(nt._fornetcon_weight_perm, nullptr);
    // Manual memory management because of needing to copy NrnThread to the GPU
    // and update device-side pointers there. Note the {} ensure the allocated
    // arrays are zero-initialised.
    nt._fornetcon_perm_indices_size = n_perm_indices + 1;
    nt._fornetcon_perm_indices = new size_t[nt._fornetcon_perm_indices_size]{};
    nt._fornetcon_weight_perm_size = n_weight_perm;
    nt._fornetcon_weight_perm = new size_t[nt._fornetcon_weight_perm_size]{};

    // From dparam fornetcon slots, compute displacement vector, and
    // set the dparam fornetcon slot to the index of the displacement vector
    // to allow later filling the _fornetcon_weight_perm.
    // The instances are visited in the same order as in the counting pass above
    // (map order of the types, then instance order).
    size_t i_perm_indices = 0;
    nt._fornetcon_perm_indices[0] = 0;
    for (const auto& kv: type_to_slot) {
        int mtype = kv.first;
        int fnslot = kv.second;
        int nodecount = nt._ml_list[mtype]->nodecount;
        for (int i = 0; i < nodecount; ++i) {
            int* fn = fornetcon_slot(mtype, i, fnslot, nt);
            // running sum of the per-instance counts
            nt._fornetcon_perm_indices[i_perm_indices + 1] =
                nt._fornetcon_perm_indices[i_perm_indices] + size_t(*fn);
            // the slot now holds the start of this instance's range in the weight permutation
            *fn = int(nt._fornetcon_perm_indices[i_perm_indices]);
            i_perm_indices += 1;
        }
    }

    // One more iteration over NetCon to fill in weight index for
    // nt._fornetcon_weight_perm. To help with this we increment the
    // dparam fornetcon slot on each use.
    for (int i = 0; i < nt.n_netcon; ++i) {
        NetCon& nc = nt.netcons[i];
        int mtype = nc.target_->_type;
        auto search = type_to_slot.find(mtype);
        if (search != type_to_slot.end()) {
            int i_instance = nc.target_->_i_instance;
            int* fn = fornetcon_slot(mtype, i_instance, search->second, nt);
            size_t nc_w_index = size_t(nc.u.weight_index_);
            nt._fornetcon_weight_perm[size_t(*fn)] = nc_w_index;
            *fn += 1;  // next item conceptually adjacent
        }
    }

    // Put back the proper values into the dparam fornetcon slot
    // (the index of the instance's entry in _fornetcon_perm_indices).
    i_perm_indices = 0;
    for (const auto& kv: type_to_slot) {
        int mtype = kv.first;
        int fnslot = kv.second;
        int nodecount = nt._ml_list[mtype]->nodecount;
        for (int i = 0; i < nodecount; ++i) {
            int* fn = fornetcon_slot(mtype, i, fnslot, nt);
            *fn = int(i_perm_indices);
            i_perm_indices += 1;
        }
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/io/setup_fornetcon.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/sim/multicore.hpp"

namespace coreneuron {

/**
   If FOR_NETCONS is in use, set up the NrnThread fornetcon related info:
   NrnThread._fornetcon_perm_indices, NrnThread._fornetcon_weight_perm and the
   fornetcon dparam slot of every instance of a mechanism that uses FOR_NETCONS.
   Any previously allocated fornetcon arrays of the thread are released first.
**/

void setup_fornetcon_info(NrnThread& nt);

}  // namespace coreneuron


================================================
FILE: coreneuron/io/user_params.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

namespace coreneuron {

class CheckPoints;

/// This structure holds data needed in several parts of nrn_setup, phase1 and phase2.
/// These used to be global variables; they are grouped here so that they can be passed
/// as a single argument. For the most part the members are unrelated to each other.
struct UserParams {
    /// Store the given values; one FileHandler is created per cell group.
    /// The pointers and the CheckPoints reference are kept as given, not copied.
    UserParams(int ngroup_,
               int* gidgroups_,
               const char* path_,
               const char* restore_path_,
               CheckPoints& checkPoints_)
        : ngroup(ngroup_)
        , gidgroups(gidgroups_)
        , path(path_)
        , restore_path(restore_path_)
        , file_reader(ngroup_)
        , checkPoints(checkPoints_) {}

    /// direct memory mode with neuron, do not open files
    /// NOTE(review): the line above does not describe ngroup and looks like a leftover
    /// from a removed member -- confirm and drop it.
    /// Number of local cell groups
    const int ngroup;
    /// Array of cell group numbers (indices)
    const int* const gidgroups;
    /// path to dataset file
    const char* const path;
    /// Dataset path from where simulation is being restored
    const char* const restore_path;
    /// One file handler per local cell group (sized with ngroup at construction)
    std::vector<FileHandler> file_reader;
    /// Reference to the CheckPoints object given at construction
    CheckPoints& checkPoints;
};
}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/capac.cpp
================================================
/***
  THIS FILE IS AUTO GENERATED DONT MODIFY IT.
 ***/
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/permute/data_layout.hpp"

// Offload annotation (OpenACC and OpenMP variants) placed in front of the simple
// per-instance loops in this file. The expansion refers to `vdata`,
// `_cntml_padded`, `nparm` and `_nt`, so those names must be visible wherever
// the macro is used.
#define _PRAGMA_FOR_INIT_ACC_LOOP_                                                               \
    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \
    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
// Not parenthesised: `k * _STRIDE` expands to `k * _cntml_padded + _iml`, i.e.
// the index of variable k for instance _iml.
#define _STRIDE _cntml_padded + _iml

namespace coreneuron {

// Name table passed to register_mech(); entry 1 ("capacitance") is the name
// used to look up the mechanism type in capacitance_reg().
static const char* mechanism[] = {"0", "capacitance", "cm", 0, "i_cap", 0, 0};
// Forward declarations of the capacitance callbacks.
void nrn_alloc_capacitance(double*, Datum*, int);
void nrn_init_capacitance(NrnThread*, Memb_list*, int);
void nrn_jacob_capacitance(NrnThread*, Memb_list*, int);
void nrn_div_capacity(NrnThread*, Memb_list*, int);
void nrn_mul_capacity(NrnThread*, Memb_list*, int);

// Number of double values per instance (cm and i_cap).
#define nparm 2

/// Register the capacitance mechanism, declare its data layout as SOA_LAYOUT and
/// its property size as `nparm` doubles and 0 dparam entries.
void capacitance_reg(void) {
    // All methods deal with capacitance in special ways, hence only the alloc
    // and init callbacks are handed to register_mech().
    register_mech(mechanism,
                  nrn_alloc_capacitance,
                  nullptr,
                  nullptr,
                  nullptr,
                  nrn_init_capacitance,
                  nullptr,
                  nullptr,
                  -1,
                  1);

    // The type id is looked up by name once registration has happened.
    const int cap_type = nrn_get_mechtype(mechanism[1]);
    _nrn_layout_reg(cap_type, SOA_LAYOUT);
    hoc_register_prop_size(cap_type, nparm, 0);
}

// Per-instance accessors; they expand to vdata[k * _cntml_padded + _iml] and
// therefore need `vdata`, `_cntml_padded` and `_iml` in scope.
#define cm    vdata[0 * _STRIDE]
#define i_cap vdata[1 * _STRIDE]

/*
cj is analogous to 1/dt for cvode and daspk
for fixed step second order it is 2/dt and
for pure implicit fixed step it is 1/dt
It used to be static but is now a thread data variable
*/

/**
 * For every capacitance instance add `.001 * _nt->cj * cm` to the entry of
 * `_nt->_actual_d` that belongs to the instance's node.
 */
void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
    // `vdata`, `_cntml_padded` and `_iml` are the names the `cm` accessor and
    // the pragma below expand to, so they must keep these spellings.
    int _cntml_actual = ml->nodecount;
    int _cntml_padded = ml->_nodecount_padded;
    double* vdata = ml->data;
    int* ni = ml->nodeindices;
    double* _vec_d = _nt->_actual_d;
    const double cap_scale = .001 * _nt->cj;

    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
                                         ni [0:_cntml_actual],
                                         _vec_d [0:_nt->end]) if (_nt->compute_gpu)
                       async(_nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        _vec_d[ni[_iml]] += cap_scale * cm;
    }
}

/**
 * Set the capacitive current `i_cap` of every instance to zero.
 *
 * Nothing is written when `_nrn_skip_initmodel` is 1, i.e. when the state is
 * being restored from a checkpoint.
 */
void nrn_init_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
    // Names below are the ones `i_cap` and _PRAGMA_FOR_INIT_ACC_LOOP_ expand to.
    int _cntml_actual = ml->nodecount;
    int _cntml_padded = ml->_nodecount_padded;
    double* vdata = ml->data;

    const bool restoring = (_nrn_skip_initmodel == 1);
    if (restoring) {
        return;
    }

    _PRAGMA_FOR_INIT_ACC_LOOP_
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        i_cap = 0;
    }
}

/**
 * Compute the capacitive current of every instance:
 * `i_cap = .001 * _nt->cj * cm * rhs[node]`, with rhs taken from
 * `_nt->_actual_rhs`.
 *
 * Since rhs is dvm for a full or half implicit step (nrn_update_2d() replaces
 * dvi by dvi-dvx) there is no need to distinguish secondorder.
 */
void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
    // Spellings are dictated by the `cm` / `i_cap` accessors and the pragma.
    int _cntml_actual = ml->nodecount;
    int _cntml_padded = ml->_nodecount_padded;
    double* vdata = ml->data;
    int* ni = ml->nodeindices;
    double* _vec_rhs = _nt->_actual_rhs;

    /*@todo: verify the scale factor is being copied !! */
    const double cap_scale = .001 * _nt->cj;

    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
                                         ni [0:_cntml_actual],
                                         _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
                       async(_nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        i_cap = cap_scale * cm * _vec_rhs[ni[_iml]];
    }
}

/* the rest can be constructed automatically from the above info*/

/// Initialise a freshly allocated instance: the first data value (cm) receives
/// the default capacitance per cm^2, DEF_cm. `pdata` and `type` are not used.
void nrn_alloc_capacitance(double* data, Datum* pdata, int type) {
    static_cast<void>(pdata);
    static_cast<void>(type);
    *data = DEF_cm;
}

/**
 * For every instance: save the node's RHS value in `i_cap`, then divide that
 * RHS value by `1.e-3 * cm`.
 */
void nrn_div_capacity(NrnThread* _nt, Memb_list* ml, int type) {
    (void) _nt;  // kept: _nt is only reached through the macros used below
    (void) type;

    // Spellings are dictated by the accessors and _PRAGMA_FOR_INIT_ACC_LOOP_.
    int _cntml_actual = ml->nodecount;
    int _cntml_padded = ml->_nodecount_padded;
    double* vdata = ml->data;
    int* ni = ml->nodeindices;

    _PRAGMA_FOR_INIT_ACC_LOOP_
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        const int node = ni[_iml];
        i_cap = VEC_RHS(node);
        VEC_RHS(node) /= 1.e-3 * cm;
    }
}

/**
 * Multiply the RHS value of every instance's node by `.001 * _nt->cj * cm`.
 */
void nrn_mul_capacity(NrnThread* _nt, Memb_list* ml, int type) {
    (void) type;

    // Spellings are dictated by the `cm` accessor and _PRAGMA_FOR_INIT_ACC_LOOP_.
    int _cntml_actual = ml->nodecount;
    int _cntml_padded = ml->_nodecount_padded;
    double* vdata = ml->data;
    int* ni = ml->nodeindices;
    const double cfac = .001 * _nt->cj;

    _PRAGMA_FOR_INIT_ACC_LOOP_
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        VEC_RHS(ni[_iml]) *= cfac * cm;
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/eion.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/// THIS FILE IS AUTO GENERATED DONT MODIFY IT.

#include <math.h>
#include <string.h>

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mechanism/membfunc.hpp"
#include "coreneuron/permute/data_layout.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

#define _STRIDE _cntml_padded + _iml

namespace coreneuron {

// Number of global values stored per ion in nrn_ion_global_map: internal
// concentration, external concentration, and charge.
const int ion_global_map_member_size = 3;


// Number of double values per ion instance (see the erev/conci/conco/cur/dcurdv
// accessors further down).
#define nparm 5
// Name table handed to register_mech(). The literal names are placeholders:
// ion_reg() overwrites entries 1..7 with the names of the ion being registered.
static const char* mechanism[] = {/*just a template*/
                                  "0",
                                  "na_ion",
                                  "ena",
                                  "nao",
                                  "nai",
                                  0,
                                  "ina",
                                  "dina_dv_",
                                  0,
                                  0};

// Forward declarations of callbacks passed to register_mech() in ion_reg().
void nrn_init_ion(NrnThread*, Memb_list*, int);
void nrn_alloc_ion(double*, Datum*, int);

// Assigned in ion_reg() when the "na", "k" and "ca" ions are registered.
static int na_ion, k_ion, ca_ion; /* will get type for these special ions */

/**
 * Tell whether a mechanism type is an ion.
 *
 * @param type Mechanism type id.
 * @return non-zero if `type` is a valid index into nrn_ion_global_map and the
 *         entry for that type has been allocated by ion_reg(), 0 otherwise.
 */
int nrn_is_ion(int type) {
    // Old: commented to remove dependency on memb_func and alloc function
    // return (memb_func[type].alloc == ion_alloc);
    return (type >= 0                                 // negative ids would index before the map
            && type < nrn_ion_global_map_size         // type smaller than largest ion's
            && nrn_ion_global_map[type] != nullptr);  // allocated ion charge variables
}

// Number of entries in nrn_ion_global_map (largest registered ion type + 1).
int nrn_ion_global_map_size;
// Indexed by mechanism type: nullptr for non-ion types, otherwise an array of
// ion_global_map_member_size doubles allocated in ion_reg().
double** nrn_ion_global_map;
// Accessors for the three per-ion global values.
#define global_conci(type)  nrn_ion_global_map[type][0]
#define global_conco(type)  nrn_ion_global_map[type][1]
#define global_charge(type) nrn_ion_global_map[type][2]

double nrn_ion_charge(int type) {
    return global_charge(type);
}

/**
 * Register the ion mechanism "<name>_ion" (if it is not registered yet) and
 * record or verify its valence.
 *
 * On first registration the global map entry for the ion is allocated, the
 * mechanism is registered with SoA layout, and the default concentrations and
 * charge are set (na, k and ca have built-in defaults, any other ion starts
 * with the VAL_SENTINAL charge).
 *
 * @param name    Ion name without the "_ion" suffix, e.g. "na".
 * @param valence Valence from the USEION statement, or VAL_SENTINAL (-10000.)
 *                when the statement does not specify one.
 *
 * Calls nrn_exit(1) when the name does not fit the name buffers, when two
 * different valences are given for the same ion, or when no valence is known.
 */
void ion_reg(const char* name, double valence) {
    char buf[7][50];
#define VAL_SENTINAL -10000.

    // "di<name>_dv_" is the longest derived name (strlen(name) + 7 characters);
    // refuse names for which it would not fit instead of overrunning the stack.
    if (strlen(name) + 7 >= sizeof(buf[0])) {
        fprintf(stderr,
                "ion name '%s' is too long (at most %zu characters are supported)\n",
                name,
                sizeof(buf[0]) - 8);
        nrn_exit(1);
    }

    snprintf(buf[0], sizeof(buf[0]), "%s_ion", name);
    snprintf(buf[1], sizeof(buf[1]), "e%s", name);
    snprintf(buf[2], sizeof(buf[2]), "%si", name);
    snprintf(buf[3], sizeof(buf[3]), "%so", name);
    snprintf(buf[5], sizeof(buf[5]), "i%s", name);
    snprintf(buf[6], sizeof(buf[6]), "di%s_dv_", name);
    for (int i = 0; i < 7; i++) {
        mechanism[i + 1] = buf[i];
    }
    mechanism[5] = nullptr; /* buf[4] not used above */
    int mechtype = nrn_get_mechtype(buf[0]);
    if (mechtype >= nrn_ion_global_map_size ||
        nrn_ion_global_map[mechtype] == nullptr) {  // if hasn't yet been allocated

        // allocates mem for ion in ion_map and sets null all non-ion types
        if (nrn_ion_global_map_size <= mechtype) {
            int size = mechtype + 1;
            nrn_ion_global_map = (double**) erealloc(nrn_ion_global_map, sizeof(double*) * size);

            for (int i = nrn_ion_global_map_size; i < mechtype; i++) {
                nrn_ion_global_map[i] = nullptr;
            }
            nrn_ion_global_map_size = mechtype + 1;
        }
        nrn_ion_global_map[mechtype] = (double*) emalloc(ion_global_map_member_size *
                                                         sizeof(double));

        register_mech((const char**) mechanism,
                      nrn_alloc_ion,
                      nrn_cur_ion,
                      nullptr,
                      nullptr,
                      nrn_init_ion,
                      nullptr,
                      nullptr,
                      -1,
                      1);
        mechtype = nrn_get_mechtype(mechanism[1]);
        _nrn_layout_reg(mechtype, SOA_LAYOUT);
        hoc_register_prop_size(mechtype, nparm, 1);
        hoc_register_dparam_semantics(mechtype, 0, "iontype");
        nrn_writes_conc(mechtype, 1);

        // The "<name>i0_<name>_ion" / "<name>o0_..." strings that used to be
        // formatted into buf[0] and buf[1] at this point were never read by
        // anything in this function; they only overran the buffers for longer
        // names and replaced the ion name printed by the error messages below,
        // so they are no longer built.
        if (strcmp("na", name) == 0) {
            na_ion = mechtype;
            global_conci(mechtype) = DEF_nai;
            global_conco(mechtype) = DEF_nao;
            global_charge(mechtype) = 1.;
        } else if (strcmp("k", name) == 0) {
            k_ion = mechtype;
            global_conci(mechtype) = DEF_ki;
            global_conco(mechtype) = DEF_ko;
            global_charge(mechtype) = 1.;
        } else if (strcmp("ca", name) == 0) {
            ca_ion = mechtype;
            global_conci(mechtype) = DEF_cai;
            global_conco(mechtype) = DEF_cao;
            global_charge(mechtype) = 2.;
        } else {
            global_conci(mechtype) = DEF_ioni;
            global_conco(mechtype) = DEF_iono;
            global_charge(mechtype) = VAL_SENTINAL;
        }
    }
    // buf[0] still holds "<name>_ion" here, which is what the messages report.
    double val = global_charge(mechtype);
    if (valence != VAL_SENTINAL && val != VAL_SENTINAL && valence != val) {
        fprintf(stderr,
                "%s ion valence defined differently in\n\
two USEION statements (%g and %g)\n",
                buf[0],
                valence,
                global_charge(mechtype));
        nrn_exit(1);
    } else if (valence == VAL_SENTINAL && val == VAL_SENTINAL) {
        fprintf(stderr,
                "%s ion valence must be defined in\n\
the USEION statement of any model using this ion\n",
                buf[0]);
        nrn_exit(1);
    } else if (valence != VAL_SENTINAL) {
        global_charge(mechtype) = valence;
    }
}

#if VECTORIZE
// Per-instance accessors; each expands to pd[k * _cntml_padded + _iml] and so
// needs `pd`, `_cntml_padded` and `_iml` in scope.
#define erev   pd[0 * _STRIDE] /* From Eion */
#define conci  pd[1 * _STRIDE]
#define conco  pd[2 * _STRIDE]
#define cur    pd[3 * _STRIDE]
#define dcurdv pd[4 * _STRIDE]

/*
 handle erev, conci, conco "in the right way" according to ion_style
 default. See nrn/lib/help/nrnoc.help.
ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit])

 ica is assigned
 eca is parameter but if conc exists then eca is assigned
 if conc is nrnocCONST then eca calculated on finitialize
 if conc is STATE then eca calculated on fadvance and conc finitialize
        with global nai0, nao0

 nernst(ci, co, charge) and ghk(v, ci, co, charge) available to hoc
 and models.
*/

// Per-instance style flags, read from `ppd` (needs `ppd` and `_iml` in scope).
#define iontype ppd[_iml] /* how _AMBIGUOUS is to be handled */
/*the bitmap is (octal values)
03	concentration unused, nrnocCONST, DEP, STATE
04	initialize concentrations
030	reversal potential unused, nrnocCONST, DEP, STATE
040	initialize reversal potential
0100	calc reversal during fadvance
0200	ci being written by a model
0400	co being written by a model
*/

// Global per-ion values; these expand to global_*(type) and need a variable
// named `type` in scope.
#define charge global_charge(type)
#define conci0 global_conci(type)
#define conco0 global_conco(type)

/// Return ktf(celsius) divided by the global charge of ion mechanism `type`;
/// used for computing the jacobian element dconc'/dconc.
double nrn_nernst_coef(int type) {
    const double kt_over_f = ktf(celsius);
    return kt_over_f / charge;  // `charge` expands to global_charge(type)
}

/**
 * Reset the current accumulators of every ion instance: `cur` and `dcurdv` are
 * set to zero and, for instances whose iontype has the 0100 bit set, `erev` is
 * recomputed with nrn_nernst().
 *
 * Must be called prior to any channels which update the currents.
 */
void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
    (void) nt; /* only referenced by the offload pragmas below */

    // `pd`, `ppd`, `_cntml_padded` and `_iml` are the names the accessor macros
    // (erev, conci, conco, cur, dcurdv, iontype) expand to.
    int _cntml_actual = ml->nodecount;
    int _cntml_padded = ml->_nodecount_padded;
    double* pd = ml->data;
    Datum* ppd = ml->pdata;

    // clang-format off
    nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
                                         ppd[0:_cntml_actual],
                                         nrn_ion_global_map[0:nrn_ion_global_map_size]
                                                           [0:ion_global_map_member_size])
                                 if (nt->compute_gpu)
                                 async(nt->stream_id))
    // clang-format on
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        dcurdv = 0.;
        cur = 0.;
        const bool recompute_erev = (iontype & 0100) != 0;
        if (recompute_erev) {
            erev = nrn_nernst(conci, conco, charge, celsius);
        }
    }
}

/**
 * Initialise every ion instance according to its iontype flags: with bit 04 the
 * concentrations are set to the global defaults (conci0 / conco0), with bit 040
 * `erev` is computed with nrn_nernst().
 *
 * Does nothing when `_nrn_skip_initmodel` is 1 (restoring from a checkpoint).
 *
 * Must be called prior to other models which possibly also initialize
 * concentrations based on their own states.
 */
void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) {
    (void) nt; /* only referenced by the offload pragmas below */

    int _cntml_actual = ml->nodecount;

    // skip initialization if restoring from checkpoint
    if (_nrn_skip_initmodel == 1) {
        return;
    }

    // `pd`, `ppd`, `_cntml_padded` and `_iml` are the names the accessor macros
    // expand to.
    int _cntml_padded = ml->_nodecount_padded;
    double* pd = ml->data;
    Datum* ppd = ml->pdata;

    // There was no async(...) clause in the initial OpenACC implementation, so
    // no `nowait` clause has been added to the OpenMP implementation. TODO:
    // verify if this can be made asynchronous or if there is a strong reason it
    // needs to be like this.
    // clang-format off
    nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
                                         ppd[0:_cntml_actual],
                                         nrn_ion_global_map[0:nrn_ion_global_map_size]
                                                           [0:ion_global_map_member_size])
                                 if (nt->compute_gpu))
    // clang-format on
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
        const bool init_conc = (iontype & 04) != 0;
        if (init_conc) {
            conci = conci0;
            conco = conco0;
        }
        const bool init_erev = (iontype & 040) != 0;
        if (init_erev) {
            erev = nrn_nernst(conci, conco, charge, celsius);
        }
    }
}

/// Allocation callback registered for ion mechanisms in ion_reg(). It is not
/// expected to run: the body is an unconditional assertion failure (which is a
/// no-op when assertions are compiled out). None of the parameters is used.
void nrn_alloc_ion(double* p, Datum* ppvar, int _type) {
    assert(0);
}

/**
 * When `secondorder` is 2, add `dcurdv * rhs[node]` to the current `cur` of
 * every instance of every ion mechanism of the thread (rhs taken from
 * `_nt->_actual_rhs`). For any other value of `secondorder` nothing is done.
 */
void second_order_cur(NrnThread* _nt, int secondorder) {
    double* _vec_rhs = _nt->_actual_rhs;

    if (secondorder != 2) {
        return;
    }

    for (NrnThreadMembList* tml = _nt->tml; tml; tml = tml->next) {
        if (!nrn_is_ion(tml->index)) {
            continue;
        }
        // `pd`, `_cntml_padded` and `_iml` are the names `cur` and `dcurdv`
        // expand to.
        Memb_list* ml = tml->ml;
        int _cntml_actual = ml->nodecount;
        int _cntml_padded = ml->_nodecount_padded;
        int* ni = ml->nodeindices;
        double* pd = ml->data;
        nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
                                             ni [0:_cntml_actual],
                                             _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
                           async(_nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
        for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
            cur += dcurdv * (_vec_rhs[ni[_iml]]);
        }
    }
}
}  // namespace coreneuron
#endif


================================================
FILE: coreneuron/mechanism/eion.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/// THIS FILE IS AUTO GENERATED DONT MODIFY IT.

#pragma once

namespace coreneuron {

/// Non-zero if the given mechanism type is a registered ion, 0 otherwise.
extern int nrn_is_ion(int);
/// Register the ion "<name>_ion" (first argument: ion name without suffix) and
/// record or check its valence (second argument).
extern void ion_reg(const char*, double);

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/mech/cfile/cabvars.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

namespace coreneuron {

// Registration functions of the built-in mechanisms. Only some of them appear
// in the table below; the others are merely declared here.
extern void capacitance_reg(void), _passive_reg(void),
#if EXTRACELLULAR
    extracell_reg_(void),
#endif
    _stim_reg(void), _hh_reg(void), _netstim_reg(void), _expsyn_reg(void), _exp2syn_reg(void),
    _svclmp_reg(void);

// nullptr-terminated table of registration functions. Being `static` in a
// header, every translation unit that includes this file gets its own copy.
static void (*mechanism[])(void) = {/* type will start at 3 */
                                    capacitance_reg,
                                    _passive_reg,
#if EXTRACELLULAR
                                    /* extracellular requires special handling and must be type 5 */
                                    extracell_reg_,
#endif
                                    nullptr};

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/mech/enginemech.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

/**
 * \file
 * \brief Provides interface function for CoreNEURON mechanism library and NEURON
 *
 * libcorenrnmech is a interface library provided to building standalone executable
 * special-core. Also, it is used by NEURON to run CoreNEURON via dlopen to execute
 * models via in-memory transfer.
 */

#include <cstdlib>
#include <coreneuron/engine.h>

namespace coreneuron {

/** Mechanism registration function
 *
 * If external mechanisms are present (ADDITIONAL_MECHS is defined) then the
 * modl_reg function generated in mod_func.cpp is used, otherwise an empty one
 * is defined here.
 */
#ifdef ADDITIONAL_MECHS
extern void modl_reg();
#else
void modl_reg() {}
#endif

/// variables defined in coreneuron library; set by corenrn_embedded_run() below
extern bool nrn_have_gaps;
extern bool nrn_use_fast_imem;

/// function defined in coreneuron library; called by solve_core() after the run
extern void nrn_cleanup_ion_map();
}  // namespace coreneuron

/** Initialize mechanisms and run a simulation using CoreNEURON
 *
 * Registers the built-in mechanisms (mk_mech_init) and the additional ones
 * (modl_reg), runs the simulation and finally cleans up the ion map.
 *
 * This is mainly used to build the nrniv-core executable.
 *
 * @param argc Number of command line arguments
 * @param argv Command line arguments
 * @return The value returned by run_solve_core()
 */
int solve_core(int argc, char** argv) {
    mk_mech_init(argc, argv);
    coreneuron::modl_reg();

    const int status = run_solve_core(argc, argv);

    coreneuron::nrn_cleanup_ion_map();
    return status;
}

extern "C" {

/// global variables from coreneuron library; both are set at the start of
/// corenrn_embedded_run()
extern bool corenrn_embedded;
extern int corenrn_embedded_nthread;

/// parse arguments from neuron and prepare new ones for coreneuron; argc and
/// argv are outputs. The caller releases the returned string with free() and
/// the argv array with delete[] (see corenrn_embedded_run).
char* prepare_args(int& argc, char**& argv, int use_mpi, const char* mpi_lib, const char* nrn_arg);

/// initialize standard mechanisms from coreneuron
void mk_mech_init(int argc, char** argv);

/// set openmp threads equal to neuron's pthread
void set_openmp_threads(int nthread);

/** Run CoreNEURON in embedded mode with NEURON
 *
 * @param nthread Number of Pthreads on NEURON side
 * @param have_gaps Non-zero if gap junctions are used
 * @param use_mpi Non-zero if MPI is used on NEURON side
 * @param use_fast_imem Non-zero if fast membrane current calculation is enabled
 * @param mpi_lib Forwarded unchanged to prepare_args()
 * @param nrn_arg Command line arguments passed by NEURON
 * @return 1 if embedded mode is used otherwise 0
 * \todo Change return type semantics
 */
int corenrn_embedded_run(int nthread,
                         int have_gaps,
                         int use_mpi,
                         int use_fast_imem,
                         const char* mpi_lib,
                         const char* nrn_arg) {
    // Mirror NEURON's settings into CoreNEURON's internal variables.
    corenrn_embedded = true;
    corenrn_embedded_nthread = nthread;
    coreneuron::nrn_have_gaps = (have_gaps != 0);
    coreneuron::nrn_use_fast_imem = (use_fast_imem != 0);

    // Use as many OpenMP threads as NEURON uses pthreads.
    set_openmp_threads(nthread);

    // Turn NEURON's arguments into an argc/argv pair for CoreNEURON. Both are
    // filled in by prepare_args(); the returned string is released below.
    int argc{};
    char** argv{};
    char* arg_storage = prepare_args(argc, argv, use_mpi, mpi_lib, nrn_arg);

    // Register the built-in mechanisms.
    mk_mech_init(argc, argv);

    // Register the additional mechanisms built into special-core, but only on
    // the first call of this function.
    static bool extra_mechs_registered = false;
    if (!extra_mechs_registered) {
        coreneuron::modl_reg();
        extra_mechs_registered = true;
    }

    // Run the simulation.
    run_solve_core(argc, argv);

    // Release the temporary string and the argv array created by prepare_args.
    free(arg_storage);
    delete[] argv;

    return corenrn_embedded ? 1 : 0;
}
}


================================================
FILE: coreneuron/mechanism/mech/mod2c_core_thread.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mechanism/mechanism.hpp"
#include "coreneuron/utils/offload.hpp"

namespace coreneuron {

// Not parenthesised: `k * _STRIDE` expands to `k * _cntml_padded + _iml`.
#define _STRIDE _cntml_padded + _iml

// Argument list forwarded between per-instance functions, with and without a
// trailing comma. The names must match those introduced by the *proto* macros.
#define _threadargscomma_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _ml, _v,
// Matching parameter declarations (trailing comma variant).
#define _threadargsprotocomma_                                                                    \
    int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \
        Memb_list *_ml, double _v,
#define _threadargs_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _ml, _v
// Matching parameter declarations (no trailing comma).
#define _threadargsproto_                                                                         \
    int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \
        Memb_list *_ml, double _v

/// One matrix element of a SparseObj, linked to its neighbours in the same
/// column (r_up / r_down) and in the same row (c_left / c_right).
/// Unlike Item and SparseObj, the members have no default initializers.
struct Elm {
    unsigned row;        /* Row location */
    unsigned col;        /* Column location */
    double* value;       /* The value SOA  _cntml_padded of them*/
    struct Elm* r_up;    /* Link to element in same column */
    struct Elm* r_down;  /*       in solution order */
    struct Elm* c_left;  /* Link to left element in same row */
    struct Elm* c_right; /*       in solution order (see getelm) */
};

/// Node of a doubly linked list (next / prev) that refers to a matrix element
/// and carries the order of a row. All members are value-initialized.
struct Item {
    Elm* elm{};
    unsigned norder{}; /* order of a row */
    Item* next{};
    Item* prev{};
};

/// A list is represented by an Item.
using List = Item; /* list of mixed items */

/// State of one sparse matrix solver object. All members are value-initialized,
/// i.e. pointers start as nullptr and counters as 0.
struct SparseObj {            /* all the state information */
    Elm** rowst{};            /* link to first element in row (solution order)*/
    Elm** diag{};             /* link to pivot element in row (solution order)*/
    void* elmpool{};          /* no interthread cache line sharing for elements */
    unsigned neqn{};          /* number of equations */
    unsigned _cntml_padded{}; /* number of instances */
    unsigned* varord{};       /* row and column order for pivots */
    double* rhs{};            /* initially- right hand side        finally - answer */
    unsigned* ngetcall{};     /* per instance counter for number of calls to _getelm */
    int phase{};              /* 0-solution phase; 1-count phase; 2-build list phase */
    int numop{};
    unsigned coef_list_size{};
    double** coef_list{}; /* pointer to (first instance) value in _getelm order */
    /* don't really need the rest */
    int nroworder{};   /* just for freeing */
    Item** roworder{}; /* roworder[i] is pointer to order item for row i.
                             Does not have to be in orderlist */
    List* orderlist{}; /* list of rows sorted by norder
                             that haven't been used */
    int do_flag{};
};

/// Release a SparseObj; defined elsewhere.
extern void _nrn_destroy_sparseobj_thread(SparseObj* so);

// derived from nrn/src/scopmath/euler.c
// updated for aos/soa layout index
/// One explicit Euler step for instance `_iml`: `fun` is called to compute the
/// derivatives, then every state `var[i]` is advanced by `_nt->_dt` times its
/// derivative `der[i]`. Always returns 0.
template <typename F>
int euler_thread(int neqn, int* var, int* der, F fun, _threadargsproto_) {
    // The step size is read before the derivative function runs.
    double const step{_nt->_dt};

    fun(_threadargs_);  // std::invoke in C++17

    for (int eq = 0; eq < neqn; ++eq) {
        double const slope = _p[der[eq] * _STRIDE];
        _p[var[eq] * _STRIDE] += step * slope;
    }
    return 0;
}

/// Call `fun` once with the thread arguments and return 0. The equation count
/// and the state/derivative index lists are accepted but not consulted.
template <typename F>
int derivimplicit_thread(int n, int* slist, int* dlist, F fun, _threadargsproto_) {
    static_cast<void>(n);
    static_cast<void>(slist);
    static_cast<void>(dlist);

    fun(_threadargs_);  // std::invoke in C++17
    return 0;
}

/// Device transfer helpers for a SparseObj; defined elsewhere.
/// NOTE(review): presumably copy the object to / remove it from the accelerator,
/// as the names suggest - confirm against the implementation.
void nrn_sparseobj_copyto_device(SparseObj* so);
void nrn_sparseobj_delete_from_device(SparseObj* so);

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/mech/mod_func.c.pl
================================================
#!/usr/bin/perl
#
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================

# Construct the modl_reg() function from a provided list of modules and print
# the generated C++ source on standard output.
#
# Usage: mod_func.c.pl [MECH1.mod MECH2.mod ...]
#
# NOTE(review): each argument (minus its ".mod" suffix) is pasted verbatim into
# an identifier "_<name>_reg", so the arguments are presumably bare file names
# that are valid C identifiers - confirm against the build scripts.

# Strip the ".mod" suffix from every argument.
@mods = @ARGV;
s/\.mod$// foreach @mods;

# Sort so that the generated code does not depend on the argument order.
@mods=sort @mods;

# Without mod files, emit an empty modl_reg() and stop.
if(!@mods) {
    print STDERR "mod_func.c.pl: No mod files provided";
    print "// No mod files provided
namespace coreneuron {
  void modl_reg() {}
}
";
    exit 0;
}

# Otherwise emit declarations of all _<name>_reg functions and a modl_reg()
# that prints the list of mod files (unless the banner is disabled or this is
# not rank 0) and then calls every registration function.
print << "__eof";
#include <cstdio>
namespace coreneuron {
extern int nrnmpi_myid;
extern int nrn_nobanner_;
extern int @{[join ",\n  ", map{"_${_}_reg(void)"} @mods]};

void modl_reg() {
    if (!nrn_nobanner_ && nrnmpi_myid < 1) {
        fprintf(stderr, " Additional mechanisms from files\\n");
        @{[join "\n        ",
           map{"fprintf(stderr, \" $_.mod\");"} @mods] }
        fprintf(stderr, "\\n\\n");
    }

    @{[join "\n    ", map{"_${_}_reg();"} @mods] }
}
} //namespace coreneuron
__eof


================================================
FILE: coreneuron/mechanism/mech/modfile/exp2syn.mod
================================================
COMMENT
Two state kinetic scheme synapse described by rise time tau1,
and decay time constant tau2. The normalized peak conductance is 1.
Decay time MUST be greater than rise time.

The solution of A->G->bath with rate constants 1/tau1 and 1/tau2 is
 A = a*exp(-t/tau1) and
 G = a*tau2/(tau2-tau1)*(-exp(-t/tau1) + exp(-t/tau2))
    where tau1 < tau2

If tau2-tau1 is very small compared to tau1, this is an alphasynapse with time constant tau2.
If tau1/tau2 is very small, this is single exponential decay with time constant tau2.

The factor is evaluated in the initial block 
such that an event of weight 1 generates a
peak conductance of 1.

Because the solution is a sum of exponentials, the
coupled equations can be solved as a pair of independent equations
by the more efficient cnexp method.

ENDCOMMENT

: Interface of the Exp2Syn point process: tau1, tau2, e, i and g are RANGE
: variables and i is a non-specific current.
NEURON {
    POINT_PROCESS Exp2Syn
    RANGE tau1, tau2, e, i
    NONSPECIFIC_CURRENT i

    RANGE g
}

: Unit abbreviations used below.
UNITS {
    (nA) = (nanoamp)
    (mV) = (millivolt)
    (uS) = (microsiemens)
}

: tau1 is the rise time and tau2 the decay time constant (see the COMMENT at the
: top of the file); e is the potential at which i = g*(v - e) is zero.
PARAMETER {
    tau1 = 0.1 (ms) <1e-9,1e9>
    tau2 = 10 (ms) <1e-9,1e9>
    e=0 (mV)
}

: factor is the normalization computed in INITIAL and applied in NET_RECEIVE.
ASSIGNED {
    v (mV)
    i (nA)
    g (uS)
    factor
}

: The two exponentially decaying states; the conductance is g = B - A.
STATE {
    A (uS)
    B (uS)
}

: Reset the states and compute the normalization factor such that an event of
: weight 1 generates a peak conductance of 1.
INITIAL {
    LOCAL tp
    : keep the ratio tau1/tau2 inside [1e-9, 0.9999]; note that tau1 itself is
    : modified here
    if (tau1/tau2 > 0.9999) {
        tau1 = 0.9999*tau2
    }
    if (tau1/tau2 < 1e-9) {
        tau1 = tau2*1e-9
    }
    A = 0
    B = 0
    : tp is the time at which -exp(-t/tau1) + exp(-t/tau2) is maximal
    tp = (tau1*tau2)/(tau2 - tau1) * log(tau2/tau1)
    : factor is the reciprocal of that maximum
    factor = -exp(-tp/tau1) + exp(-tp/tau2)
    factor = 1/factor
}

: Advance the states, then compute the conductance g = B - A and the current
: i = g*(v - e).
BREAKPOINT {
    SOLVE state METHOD cnexp
    g = B - A
    i = g*(v - e)
}

: Two independent exponential decays, with time constants tau1 and tau2.
DERIVATIVE state {
    A' = -A/tau1
    B' = -B/tau2
}

: On an incoming event both states are incremented by weight*factor, so g = B - A
: is unchanged at the instant of the event.
NET_RECEIVE(weight (uS)) {
    A = A + weight*factor
    B = B + weight*factor
}


================================================
FILE: coreneuron/mechanism/mech/modfile/expsyn.mod
================================================
: Interface of the ExpSyn point process: tau, e and i are RANGE variables and i
: is a non-specific current.
NEURON {
	POINT_PROCESS ExpSyn
	RANGE tau, e, i
	NONSPECIFIC_CURRENT i
}

: Unit abbreviations used below.
UNITS {
	(nA) = (nanoamp)
	(mV) = (millivolt)
	(uS) = (microsiemens)
}

: tau is the decay time constant of g; e is the potential at which
: i = g*(v - e) is zero.
PARAMETER {
	tau = 0.1 (ms) <1e-9,1e9>
	e = 0	(mV)
}

ASSIGNED {
	v (mV)
	i (nA)
}

: The conductance is the only state.
STATE {
	g (uS)
}

: The conductance starts at zero.
INITIAL {
	g=0
}

: Advance g, then compute the current i = g*(v - e).
BREAKPOINT {
	SOLVE state METHOD cnexp
	i = g*(v - e)
}

: Exponential decay of the conductance with time constant tau.
DERIVATIVE state {
	g' = -g/tau
}

: An incoming event increments the conductance by its weight.
NET_RECEIVE(weight (uS)) {
	g = g + weight
}


================================================
FILE: coreneuron/mechanism/mech/modfile/hh.mod
================================================
TITLE hh.mod   squid sodium, potassium, and leak channels
 
COMMENT
 This is the original Hodgkin-Huxley treatment for the set of sodium, 
  potassium, and leakage channels found in the squid giant axon membrane.
  ("A quantitative description of membrane current and its application to
  conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).)
 Membrane voltage is in absolute mV and has been reversed in polarity
  from the original HH convention and shifted to reflect a resting potential
  of -65 mV.
 Remember to set celsius=6.3 (or whatever) in your HOC file.
 See squid.hoc for an example of a simulation using this model.
 SW Jaslove  6 March, 1992
ENDCOMMENT
 
: Unit abbreviations used below.
UNITS {
        (mA) = (milliamp)
        (mV) = (millivolt)
	(S) = (siemens)
}
 
? interface
: The hh mechanism reads ena and ek, writes ina and ik, and contributes the leak
: current il as a non-specific current. The rate variables are RANGE (the GLOBAL
: declaration is commented out).
NEURON {
        SUFFIX hh
        USEION na READ ena WRITE ina
        USEION k READ ek WRITE ik
        NONSPECIFIC_CURRENT il
        RANGE gnabar, gkbar, gl, el, gna, gk
        :GLOBAL minf, hinf, ninf, mtau, htau, ntau
        RANGE minf, hinf, ninf, mtau, htau, ntau
	THREADSAFE : assigned GLOBALs will be per thread
}
 
: Maximal sodium and potassium conductances, leak conductance and leak reversal
: potential.
PARAMETER {
        gnabar = .12 (S/cm2)	<0,1e9>
        gkbar = .036 (S/cm2)	<0,1e9>
        gl = .0003 (S/cm2)	<0,1e9>
        el = -54.3 (mV)
}
 
: Gating variables: gna uses m*m*m*h and gk uses n*n*n*n (see BREAKPOINT).
STATE {
        m h n
}
 
: Values computed by or supplied to the mechanism; the *inf and *tau variables
: are set by PROCEDURE rates.
ASSIGNED {
        v (mV)
        celsius (degC)
        ena (mV)
        ek (mV)

	gna (S/cm2)
	gk (S/cm2)
        ina (mA/cm2)
        ik (mA/cm2)
        il (mA/cm2)
        minf hinf ninf
	mtau (ms) htau (ms) ntau (ms)
}
 
? currents
: Advance the gating states, then compute the conductances and the three
: currents from the present voltage.
BREAKPOINT {
        SOLVE states METHOD cnexp
        gna = gnabar*m*m*m*h
	ina = gna*(v - ena)
        gk = gkbar*n*n*n*n
	ik = gk*(v - ek)      
        il = gl*(v - el)
}
 
 
: Start every gating variable at its steady-state value for the present voltage.
INITIAL {
	rates(v)
	m = minf
	h = hinf
	n = ninf
}

? states
: Each gating variable relaxes towards its steady state with its own time
: constant; rates(v) refreshes the *inf and *tau values first.
DERIVATIVE states {  
        rates(v)
        m' =  (minf-m)/mtau
        h' = (hinf-h)/htau
        n' = (ninf-n)/ntau
}
 
:LOCAL q10


? rates
: For each gate, alpha and beta are evaluated at v; the time constant is
: 1/(q10*(alpha+beta)) and the steady state is alpha/(alpha+beta).
: Note that UNITSOFF below stays in effect until the UNITSON at the end of the
: file.
PROCEDURE rates(v(mV)) {  :Computes rate and other constants at current v.
                      :Call once from HOC to initialize inf at resting v.
        LOCAL  alpha, beta, sum, q10
:        TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200

UNITSOFF
        : temperature factor: 3 per 10 degC relative to 6.3 degC
        q10 = 3^((celsius - 6.3)/10)
                :"m" sodium activation system
        alpha = .1 * vtrap(-(v+40),10)
        beta =  4 * exp(-(v+65)/18)
        sum = alpha + beta
	mtau = 1/(q10*sum)
        minf = alpha/sum
                :"h" sodium inactivation system
        alpha = .07 * exp(-(v+65)/20)
        beta = 1 / (exp(-(v+35)/10) + 1)
        sum = alpha + beta
	htau = 1/(q10*sum)
        hinf = alpha/sum
                :"n" potassium activation system
        alpha = .01*vtrap(-(v+55),10) 
        beta = .125*exp(-(v+65)/80)
	sum = alpha + beta
        ntau = 1/(q10*sum)
        ninf = alpha/sum
}
 
: Evaluates x/(exp(x/y) - 1). When |x/y| < 1e-6 the first-order approximation
: y*(1 - x/(2*y)) is used instead, which avoids the 0/0 at x = 0.
FUNCTION vtrap(x,y) {  :Traps for 0 in denominator of rate eqns.
        if (fabs(x/y) < 1e-6) {
                vtrap = y*(1 - x/y/2)
        }else{
                vtrap = x/(exp(x/y) - 1)
        }
}
 
UNITSON


================================================
FILE: coreneuron/mechanism/mech/modfile/netstim.mod
================================================
: $Id: netstim.mod 2212 2008-09-08 14:32:26Z hines $
: comments at end

: the Random idiom has been extended to support CoreNEURON.

: For backward compatibility, noiseFromRandom(hocRandom) can still be used
: as well as the default low-quality scop_exprand generator.
: However, CoreNEURON will not accept usage of the low-quality generator,
: and, if noiseFromRandom is used to specify the random stream, that stream
: must be using the Random123 generator.

: The recommended idiom for specification of the random stream is to use
: noiseFromRandom123(id1, id2[, id3])

: If any instance uses noiseFromRandom123, then no instance can use noiseFromRandom
: and vice versa.

: Interface of the NetStim artificial cell. donotuse is a BBCOREPOINTER; the
: VERBATIM code accesses it as _p_donotuse and casts it to nrnran123_State*.
NEURON	{ 
  ARTIFICIAL_CELL NetStim
  RANGE interval, number, start
  RANGE noise
  THREADSAFE : only true if every instance has its own distinct Random
  BBCOREPOINTER donotuse
}

PARAMETER {
	interval	= 10 (ms) <1e-9,1e9>: time between spikes (msec)
	number	= 10 <0,1e9>	: number of spikes (independent of noise)
	start		= 50 (ms)	: start of first spike
	noise		= 0 <0,1>	: amount of randomness (0.0 - 1.0)
}

: event: time computed in INITIAL and passed to net_send
: on: 0 means off, 1 means on
: ispike: reset to 0 in INITIAL and init_sequence
ASSIGNED {
	event (ms)
	on
	ispike
	donotuse
}

: Defines IFNEWSTYLE(arg): in CoreNEURON arg is always emitted, in NEURON it is
: only executed when _ran_compat is 2.
VERBATIM
#if NRNBBCORE /* running in CoreNEURON */

#define IFNEWSTYLE(arg) arg

#else /* running in NEURON */

/*
   1 means noiseFromRandom was called when _ran_compat was previously 0 .
   2 means noiseFromRandom123 was called when _ran_compat was previously 0.
*/
static int _ran_compat; /* specifies the noise style for all instances */
#define IFNEWSTYLE(arg) if(_ran_compat == 2) { arg }

#endif /* running in NEURON */
ENDVERBATIM

:backward compatibility
: seed(x): forwards x to set_seed(x) when built for NEURON. When NRNBBCORE is
: set the call is compiled out and the procedure does nothing.
PROCEDURE seed(x) {
VERBATIM
#if !NRNBBCORE
ENDVERBATIM
	set_seed(x)
VERBATIM
#endif
ENDVERBATIM
}

: Resets the generator state: rewinds the Random123 stream (new style only),
: clamps noise to [0, 1], clears on/ispike and, when start >= 0 and number > 0,
: schedules the first self event (flag 3).
INITIAL {

	VERBATIM
	  if (_p_donotuse) {
	    /* only this style initializes the stream on finitialize */
	    IFNEWSTYLE(nrnran123_setseq((nrnran123_State*)_p_donotuse, 0, 0);)
	  }
	ENDVERBATIM

	on = 0 : off
	ispike = 0
	: keep noise inside its documented range
	if (noise < 0) {
		noise = 0
	}
	if (noise > 1) {
		noise = 1
	}
	if (start >= 0 && number > 0) {
		on = 1
		: randomize the first spike so on average it occurs at
		: start + noise*interval
		event = start + invl(interval) - interval*(1. - noise)
		: but not earlier than 0
		if (event < 0) {
			event = 0
		}
		: flag 3 is handled in NET_RECEIVE as "from INITIAL"
		net_send(event, 3)
	}
}	

: init_sequence(t): when number > 0, switches the generator on and resets
: event and ispike to 0. The argument t is not used in the body.
PROCEDURE init_sequence(t(ms)) {
	if (number > 0) {
		on = 1
		event = 0
		ispike = 0
	}
}

: invl(mean): returns the next inter-spike interval.
: With noise == 0 the result is exactly mean; otherwise it is the fixed part
: (1 - noise)*mean plus noise*mean*erand(), where erand() draws the random
: factor. A non-positive mean is replaced by 0.01 ms first.
FUNCTION invl(mean (ms)) (ms) {
	if (mean <= 0.) {
		mean = .01 (ms) : I would worry if it were 0.
	}
	if (noise == 0) {
		: fully periodic: no random number is consumed
		invl = mean
	}else{
		invl = (1. - noise)*mean + noise*mean*erand()
	}
}
VERBATIM
#include "nrnran123.h"

#if !NRNBBCORE
/* backward compatibility */
double nrn_random_pick(void* r);
void* nrn_random_arg(int argpos);
int nrn_random_isran123(void* r, uint32_t* id1, uint32_t* id2, uint32_t* id3);
int nrn_random123_setseq(void* r, uint32_t seq, char which);
int nrn_random123_getseq(void* r, uint32_t* seq, char* which);
#endif
ENDVERBATIM

: erand(): returns one random sample used by invl().
: - If a stream pointer is stored in _p_donotuse, the sample comes from that
:   stream: nrnran123_negexp in CoreNEURON or when _ran_compat == 2, otherwise
:   nrn_random_pick (hoc Random); the VERBATIM code returns directly.
: - If no stream is set: CoreNEURON aborts via assert(0); NEURON falls through
:   to the NMODL statement erand = exprand(1).
FUNCTION erand() {
VERBATIM
	if (_p_donotuse) {
		/*
		:Supports separate independent but reproducible streams for
		: each instance. However, the corresponding hoc Random
		: distribution MUST be set to Random.negexp(1)
		*/
#if !NRNBBCORE
		if (_ran_compat == 2) {
			_lerand = nrnran123_negexp((nrnran123_State*)_p_donotuse);
		}else{
			_lerand = nrn_random_pick(_p_donotuse);
		}
#else
		_lerand = nrnran123_negexp((nrnran123_State*)_p_donotuse);
#endif
		/* early exit: skips the exprand(1) fallback below */
		return _lerand;
	}else{
#if NRNBBCORE
		assert(0);
#else
		/*
		: the old standby. Cannot use if reproducible parallel sim
		: independent of nhost or which host this instance is on
		: is desired, since each instance on this cpu draws from
		: the same stream
		*/
#endif
	}
#if !NRNBBCORE
ENDVERBATIM
	erand = exprand(1)
VERBATIM
#endif
ENDVERBATIM
}

: noiseFromRandom([hocRandom]): NEURON-only (compiled out under NRNBBCORE).
: Selects the legacy style (_ran_compat = 1) and stores the hoc Random passed
: as argument 1 in _p_donotuse, or clears the pointer when no argument is
: given. Aborts if noiseFromRandom123 was used before (_ran_compat == 2).
PROCEDURE noiseFromRandom() {
VERBATIM
#if !NRNBBCORE
 {
	void** pv = (void**)(&_p_donotuse);
	if (_ran_compat == 2) {
		fprintf(stderr, "NetStim.noiseFromRandom123 was previously called\n");
		assert(0);
	}
	_ran_compat = 1;
	if (ifarg(1)) {
		*pv = nrn_random_arg(1);
	}else{
		*pv = (void*)0;
	}
 }
#endif
ENDVERBATIM
}


: noiseFromRandom123(id1, id2[, id3]): NEURON-only (compiled out under
: NRNBBCORE). Selects the Random123 style (_ran_compat = 2), deletes any
: stream already stored in _p_donotuse and creates a new one from two or three
: ids. With fewer than two arguments the pointer is left cleared.
: Aborts if noiseFromRandom was used before (_ran_compat == 1).
PROCEDURE noiseFromRandom123() {
VERBATIM
#if !NRNBBCORE
 {
	nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse);
	if (_ran_compat == 1) {
		fprintf(stderr, "NetStim.noiseFromRandom was previously called\n");
		assert(0);
	}
	_ran_compat = 2;
	/* release the previous stream before replacing it */
	if (*pv) {
		nrnran123_deletestream(*pv);
		*pv = (nrnran123_State*)0;
	}
	if (ifarg(3)) {
		*pv = nrnran123_newstream3((uint32_t)*getarg(1), (uint32_t)*getarg(2), (uint32_t)*getarg(3));
	}else if (ifarg(2)) {
		*pv = nrnran123_newstream((uint32_t)*getarg(1), (uint32_t)*getarg(2));
	}
 }
#endif
ENDVERBATIM
}

: Releases the Random123 stream stored in _p_donotuse.
: Nothing is done when noise is 0 or the pointer is null. In NEURON the
: stream is deleted only for the noiseFromRandom123 style (_ran_compat == 2);
: the two #if branches below open the same block with a different guard.
DESTRUCTOR {
VERBATIM
	if (!noise) { return; }
	if (_p_donotuse) {
#if NRNBBCORE
		{ /* but note that mod2c does not translate DESTRUCTOR */
#else
		if (_ran_compat == 2) {
#endif
			nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse);
			nrnran123_deletestream(*pv);
			*pv = (nrnran123_State*)0;
		}
	}
ENDVERBATIM
}

VERBATIM
/*
 * Serialises the random stream state of this instance.
 *
 * Does nothing (and leaves *offset unchanged) when noise is 0. Otherwise it
 * aborts if no stream is set, and reserves 5 uint32_t slots starting at
 * d + *offset: the three stream ids, the sequence number and the "which"
 * value. When d is NULL nothing is written, but *offset is still advanced
 * by 5. x and xx are not used by this function.
 */
static void bbcore_write(double* x, int* d, int* xx, int *offset, _threadargsproto_) {
	if (!noise) { return; }
	/* error if using the legacy scop_exprand */
	if (!_p_donotuse) {
		fprintf(stderr, "NetStim: cannot use the legacy scop_negexp generator for the random stream.\n");
		assert(0);
	}
	if (d) {
		char which;
		uint32_t* di = ((uint32_t*)d) + *offset;
#if !NRNBBCORE
		if (_ran_compat == 1) {
			/* legacy style: the pointer is a hoc Random object */
			void** pv = (void**)(&_p_donotuse);
			/* error if not using Random123 generator */
			if (!nrn_random_isran123(*pv, di, di+1, di+2)) {
				fprintf(stderr, "NetStim: Random123 generator is required\n");
				assert(0);
			}
			nrn_random123_getseq(*pv, di+3, &which);
			di[4] = (int)which;
		}else{
#else
    {
#endif
			/* new style (and always in CoreNEURON): the pointer is a nrnran123_State */
			nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse);
			nrnran123_getids3(*pv, di, di+1, di+2);
			nrnran123_getseq(*pv, di+3, &which);
			di[4] = (int)which;
#if NRNBBCORE
			/* CORENeuron does not call DESTRUCTOR so... */
			nrnran123_deletestream(*pv);
                        *pv = (nrnran123_State*)0;
#endif
		}
		/*printf("Netstim bbcore_write %d %d %d\n", di[0], di[1], di[3]);*/
	}
	*offset += 5;
}

/*
 * Restores the random stream state from the 5 uint32_t slots at d + *offset
 * (three ids, sequence number, "which") and advances *offset by 5.
 * Does nothing (and leaves *offset unchanged) when noise is 0.
 * x and xx are not used by this function.
 */
static void bbcore_read(double* x, int* d, int* xx, int* offset, _threadargsproto_) {
	if (!noise) { return; }
	/* Generally, CoreNEURON, in the context of psolve, begins with
           an empty model so this call takes place in the context of a freshly
           created instance and _p_donotuse is NULL (asserted below).
	   However, this function
           is also now called from NEURON at the end of coreneuron psolve
           in order to transfer back the nrnran123 sequence state. That
           allows continuation with a subsequent psolve within NEURON or
           properly transfer back to CoreNEURON if we continue the psolve
           there. So now, extra logic is needed for this call to work in
           a NEURON context.
        */

	uint32_t* di = ((uint32_t*)d) + *offset;
#if NRNBBCORE
	/* CoreNEURON: create a fresh stream from the ids and set its sequence */
	nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse);
	assert(!_p_donotuse);
	*pv = nrnran123_newstream3(di[0], di[1], di[2]);
	nrnran123_setseq(*pv, di[3], (char)di[4]);
#else
	/* NEURON: the stream already exists; only its sequence is updated */
	uint32_t id1, id2, id3;
	assert(_p_donotuse);
	if (_ran_compat == 1) { /* Hoc Random.Random123 */
		void** pv = (void**)(&_p_donotuse);
		int b = nrn_random_isran123(*pv, &id1, &id2, &id3);
		assert(b);
		nrn_random123_setseq(*pv, di[3], (char)di[4]);
	}else{
		assert(_ran_compat == 2);
		nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse);
		nrnran123_getids3(*pv, &id1, &id2, &id3);
		nrnran123_setseq(*pv, di[3], (char)di[4]);
	}
        /* Random123 on NEURON side has same ids as on CoreNEURON side */
	assert(di[0] == id1 && di[1] == id2 && di[2] == id3);
#endif
	*offset += 5;
}
ENDVERBATIM

: next_invl(): draws the next inter-spike interval into event (only when
: number > 0) and switches the generator off once ispike has reached number.
PROCEDURE next_invl() {
	if (number > 0) {
		event = invl(interval)
	}
	if (ispike >= number) {
		on = 0
	}
}

: Event handler. The flag value selects the kind of event:
:   0 - external event: a positive weight starts a sequence if the generator
:       is off, a negative weight switches it off;
:   3 - self event sent by INITIAL: starts the sequence immediately;
:   1 - self event for a spike: emits net_event and schedules the next spike
:       while the generator is still on.
NET_RECEIVE (w) {
	if (flag == 0) { : external event
		if (w > 0 && on == 0) { : turn on spike sequence
			: but not if a netsend is on the queue
			init_sequence(t)
			: randomize the first spike so on average it occurs at
			: noise*interval (most likely interval is always 0)
			next_invl()
			event = event - interval*(1. - noise)
			net_send(event, 1)
		}else if (w < 0) { : turn off spiking definitively
			on = 0
		}
	}
	if (flag == 3) { : from INITIAL
		if (on == 1) { : but ignore if turned off by external event
			init_sequence(t)
			net_send(0, 1)
		}
	}
	if (flag == 1 && on == 1) {
		ispike = ispike + 1
		net_event(t)
		: next_invl may set on = 0 when the requested number of spikes is reached
		next_invl()
		if (on == 1) {
			net_send(event, 1)
		}
	}
}

: bbsavestate(&xdir, &xval): saves or restores the Random123 sequence state.
: Always returns 0. Active only in NEURON with the noiseFromRandom123 style
: (_ran_compat == 2) and a non-null stream. The value of *xdir selects the
: action:
:   -1 : set *xdir to 2 and return (presumably the number of values used
:        in xval - confirm against the caller);
:    0 : store the sequence number and "which" into xval[0], xval[1];
:    1 : set the stream sequence from xval[0], xval[1].
FUNCTION bbsavestate() {
  bbsavestate = 0
  : limited to noiseFromRandom123
VERBATIM
#if !NRNBBCORE
  if (_ran_compat == 2) {
    nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse);
    if (!*pv) { return 0.0; }
    char which;
    uint32_t seq;
    double *xdir, *xval;
    xdir = hoc_pgetarg(1);
    if (*xdir == -1.) { *xdir = 2; return 0.0; }
    xval = hoc_pgetarg(2);
    if (*xdir == 0.) {
      nrnran123_getseq(*pv, &seq, &which);
      xval[0] = (double)seq;
      xval[1] = (double)which;
    }
    if (*xdir == 1) {
      nrnran123_setseq(*pv, (uint32_t)xval[0], (char)xval[1]);
    }
  } /* else do nothing */
#endif
ENDVERBATIM
}


COMMENT
Presynaptic spike generator
---------------------------

This mechanism has been written to be able to use synapses in a single
neuron receiving various types of presynaptic trains.  This is a "fake"
presynaptic compartment containing a spike generator.  The trains
of spikes can be either periodic or noisy (Poisson-distributed)

Parameters:
   noise: 	between 0 (no noise-periodic) and 1 (fully noisy)
   interval: 	mean time between spikes (ms)
   number: 	number of spikes (independent of noise)

Written by Z. Mainen, modified by A. Destexhe, The Salk Institute

Modified by Michael Hines for use with CVode
The intrinsic bursting parameters have been removed since
generators can stimulate other generators to create complicated bursting
patterns with independent statistics (see below)

Modified by Michael Hines to use logical event style with NET_RECEIVE
This stimulator can also be triggered by an input event.
If the stimulator is in the on==0 state (no net_send events on queue)
 and receives a positive weight
event, then the stimulator changes to the on=1 state and goes through
its entire spike sequence before changing to the on=0 state. During
that time it ignores any positive weight events. If, in an on!=0 state,
the stimulator receives a negative weight event, the stimulator will
change to the on==0 state. In the on==0 state, it will ignore any arriving
net_send events. A change to the on==1 state immediately fires the first spike of
its sequence.

ENDCOMMENT


================================================
FILE: coreneuron/mechanism/mech/modfile/passive.mod
================================================
TITLE passive membrane channel

: Unit abbreviations used in this file.
UNITS {
	(mV) = (millivolt)
	(mA) = (milliamp)
	(S) = (siemens)
}

: Density mechanism "pas": contributes the non-specific current i and exposes
: g and e as RANGE variables.
NEURON {
	SUFFIX pas
	NONSPECIFIC_CURRENT i
	RANGE g, e
}

PARAMETER {
	: conductance, default 0.001 S/cm2, limited to [0, 1e9]
	g = .001	(S/cm2)	<0,1e9>
	: reversal potential, default -70 mV
	e = -70	(mV)
}

ASSIGNED {v (mV)  i (mA/cm2)}

: Ohmic current: proportional to the distance of v from e.
BREAKPOINT {
	i = g*(v - e)
}


================================================
FILE: coreneuron/mechanism/mech/modfile/pattern.mod
================================================
: The spikeout pairs (t, gid) resulting from a parallel network simulation
: can become the stimulus for any single cpu subnet as long as the gid's are
: consistent.
: Note: hoc must retain references to the tvec and gidvec vectors
: to prevent the Info from going out of existence

: Interface of the mechanism: PatternStim is an ARTIFICIAL_CELL with one RANGE
: variable (fake_output) and one BBCOREPOINTER (ptr). The VERBATIM code in
: this file accesses ptr as _p_ptr and stores an Info* in it.
NEURON {
	ARTIFICIAL_CELL PatternStim
	RANGE fake_output
	THREADSAFE
	BBCOREPOINTER ptr
}

PARAMETER {
	: non-zero is passed on to nrn_fake_fire as 1, zero as 0 (see sendgroup)
	fake_output = 0
}

ASSIGNED {
	ptr
}

: Rewinds the pattern through initps() and, when it reports a usable pattern
: (return value > 0), queues the first self event (flag 1) with zero delay.
INITIAL {
	if (initps() > 0) { net_send(0, 1) }
}

: On a self event (flag 1) delivers the next group of spikes via sendgroup()
: and re-arms itself for the time sendgroup() returned. sendgroup() returns a
: time earlier than t once the pattern is exhausted, which ends the chain.
NET_RECEIVE (w) {LOCAL nst
	if (flag == 1) {
		nst = sendgroup()
		if (nst >= t) {net_send(nst - t, 1)}
	}
}

VERBATIM

/* Spike pattern: `size` (time, gid) pairs plus a cursor into them. */
struct Info {
	int size;       /* number of entries in tvec and gidvec */
	double* tvec;   /* spike times */
	int* gidvec;    /* gid for each spike time */
	int index;      /* index of the next entry to deliver */
};

/* Declares a local `ip` that aliases the BBCOREPOINTER slot as an Info**. */
#define INFOCAST Info** ip = (Info**)(&(_p_ptr))

ENDVERBATIM


VERBATIM
/*
 * Allocates an empty Info (size 0, null vectors, index 0) with hoc_Emalloc
 * and returns it. The caller is responsible for storing the pointer; this
 * function does not touch the instance's pointer slot, so the thread
 * arguments are accepted only to keep the existing call signature.
 */
Info* mkinfo(_threadargsproto_) {
	Info* info = (Info*)hoc_Emalloc(sizeof(Info)); hoc_malchk();
	info->size = 0;
	info->tvec = nullptr;
	info->gidvec = nullptr;
	info->index = 0;
	return info;
}
/* for CoreNEURON checkpoint save and restore */
namespace coreneuron {
/* Returns the current cursor (Info::index) of this instance's pattern.
   Dereferences the stored Info* without a null check. */
int checkpoint_save_patternstim(_threadargsproto_) {
	INFOCAST; Info* info = *ip;
	return info->index;
}
/* Restores the cursor to _index and queues a self event (flag 1.0) for time
   _te via artcell_net_send. Dereferences the stored Info* without a null
   check. */
void checkpoint_restore_patternstim(int _index, double _te, _threadargsproto_) {
    INFOCAST; Info* info = *ip;
    info->index = _index;
    artcell_net_send(_tqitem, -1, (Point_process*)_nt->_vdata[_ppvar[1*_STRIDE]], _te, 1.0);
}
} //namespace coreneuron
ENDVERBATIM

: initps(): rewinds the pattern cursor to the first entry.
: Returns 1 when an Info with a time vector is attached to this instance,
: otherwise 0. A missing Info is tolerated: the pointer is checked before it
: is dereferenced.
FUNCTION initps() {
VERBATIM {
	INFOCAST; Info* info = *ip;
	_linitps = 0.;
	if (info) {
		/* reset the cursor only once the pointer is known to be valid */
		info->index = 0;
		if (info->tvec) {
			_linitps = 1.;
		}
	}
}
ENDVERBATIM
}

: sendgroup(): delivers pending pattern entries through nrn_fake_fire and
: returns the time of the next undelivered entry, or t - 1 when the pattern is
: exhausted (the caller treats a value earlier than t as "stop").
: Delivery continues past 100 entries only while the next entry is not in the
: future (its time is <= t).
FUNCTION sendgroup() {
VERBATIM {
	INFOCAST; Info* info = *ip;
	int size = info->size;
	int fake_out;
	double* tvec = info->tvec;
	int* gidvec = info->gidvec;
	int i;
	fake_out = fake_output ? 1 : 0;
	for (i=0; info->index < size; ++i) {
		/* only if the gid is NOT on this machine */
		nrn_fake_fire(gidvec[info->index], tvec[info->index], fake_out);
		++info->index;
		/* bounds check first: after the increment the cursor may equal
		   size, and tvec[size] must not be read */
		if (i > 100 && info->index < size && t < tvec[info->index]) { break; }
	}
	if (info->index >= size) {
		_lsendgroup = t - 1.;
	}else{
		_lsendgroup = tvec[info->index];
	}
}
ENDVERBATIM
}

VERBATIM
/* Intentionally empty: no data is written or read through the bbcore
   interface for this mechanism, and *offset is left unchanged. */
static void bbcore_write(double* x, int* d, int* xx, int *offset, _threadargsproto_){}
static void bbcore_read(double* x, int* d, int* xx, int* offset, _threadargsproto_){}
namespace coreneuron {
/* Attaches a freshly allocated Info to this instance, pointing it at the
   caller's tv/gv arrays of length `size` (the arrays are not copied), and
   queues a self event (flag 1.0) at time t. */
void pattern_stim_setup_helper(int size, double* tv, int* gv, _threadargsproto_) {
	INFOCAST;
	Info* info = mkinfo(_threadargs_);
	*ip = info;
	info->size = size;
	info->tvec = tv;
	info->gidvec = gv;
    // initiate event chain (needed in case of restore)
	artcell_net_send ( _tqitem, -1, (Point_process*) _nt->_vdata[_ppvar[1*_STRIDE]], t +  0.0 , 1.0 ) ;
}

/* Returns the address of this instance's Info* slot so the caller can
   install its own Info pointer. */
Info** pattern_stim_info_ref(_threadargsproto_) {
    // Info shared with NEURON.
    // So nrn <-> corenrn needs no actual transfer for direct mode psolve.
    INFOCAST;
    return ip; // Caller sets *ip to NEURON's PatternStim Info*
}

} // namespace coreneuron
ENDVERBATIM


================================================
FILE: coreneuron/mechanism/mech/modfile/stim.mod
================================================
COMMENT
Since this is an electrode current, positive values of i depolarize the cell
and in the presence of the extracellular mechanism there will be a change
in vext since i is not a transmembrane current but a current injected
directly to the inside of the cell.
ENDCOMMENT

: Point process IClamp: injects the electrode current i. del, dur, amp and i
: are RANGE variables.
NEURON {
	POINT_PROCESS IClamp
	RANGE del, dur, amp, i
	ELECTRODE_CURRENT i
}
UNITS {
	(nA) = (nanoamp)
}

: del: start time of the pulse, dur: its duration, amp: its amplitude
: (see BREAKPOINT for how they are used).
PARAMETER {
	del (ms)
	dur (ms)	<0,1e9>
	amp (nA)
}
ASSIGNED { i (nA) }

: No current is injected at initialization.
INITIAL {
	i = 0
}

BREAKPOINT {
	: Rectangular pulse: i equals amp for del <= t < del + dur, 0 otherwise.
	: For fixed step methods at_time can be ignored; it was introduced for
	: the variable time step method and will be deprecated anyway.
	: at_time(del)
	: at_time(del+dur)

	if (t >= del && t < del + dur) {
		i = amp
	}else{
		i = 0
	}
}


================================================
FILE: coreneuron/mechanism/mech/modfile/svclmp.mod
================================================
TITLE svclmp.mod
COMMENT
Single electrode Voltage clamp with three levels.
Clamp is on at time 0, and off at time
dur1+dur2+dur3. When clamp is off the injected current is 0.
The clamp levels are amp1, amp2, amp3.
i is the injected current, vc measures the control voltage.
Do not insert several instances of this model at the same location in order to
make level changes. That is equivalent to independent clamps and they will
have incompatible internal state values.
The electrical circuit for the clamp is exceedingly simple:
vc ---'\/\/`--- cell
        rs

Note that since this is an electrode current model v refers to the
internal potential which is equivalent to the membrane potential v when
there is no extracellular membrane mechanism present but is v+vext when
one is present.
Also since i is an electrode current,
positive values of i depolarize the cell. (Normally, positive membrane currents
are outward and thus hyperpolarize the cell)
ENDCOMMENT

INDEPENDENT {t FROM 0 TO 1 WITH 1 (ms)}

: Number of clamp levels; not referenced by the other blocks of this file.
DEFINE NSTEP 3

: Point process SEClamp: injects the electrode current i. The three
: duration/amplitude pairs, the series resistance rs, the command voltage vc
: and i are RANGE variables.
NEURON {
	POINT_PROCESS SEClamp
	ELECTRODE_CURRENT i
	RANGE dur1, amp1, dur2, amp2, dur3, amp3, rs, vc, i
}

UNITS {
	(nA) = (nanoamp)
	(mV) = (millivolt)
	(uS) = (microsiemens)
}


: rs: series resistance between vc and the cell; durN/ampN: duration and
: clamp level of step N.
PARAMETER {
	rs = 1 (megohm) <1e-9, 1e9>
	dur1 (ms) 	  amp1 (mV)
	dur2 (ms) <0,1e9> amp2 (mV)
	dur3 (ms) <0,1e9> amp3 (mV)
}

ASSIGNED {
	v (mV)	: automatically v + vext when extracellular is present
	i (nA)
	: command voltage currently applied (set in vstim)
	vc (mV)
	: end time of step 2 (dur1 + dur2), set in INITIAL
	tc2 (ms)
	: end time of step 3 (tc2 + dur3), set in INITIAL
	tc3 (ms)
	: 1 while the clamp is active, 0 otherwise
	on
}

: Precomputes the end times of the second and third step and marks the clamp
: as off until vstim() runs.
INITIAL {
	tc2 = dur1 + dur2
	tc3 = tc2 + dur3
	on = 0
}

: Solves icur with METHOD after_cvode, then calls vstim(), which updates
: vc/on and recomputes i. The COMMENT block after icur() explains why both
: are needed.
BREAKPOINT {
	SOLVE icur METHOD after_cvode
	vstim()
}

: icur(): clamp current through the series resistance, i = (vc - v)/rs,
: while the clamp is on; 0 when it is off.
PROCEDURE icur() {
	if (on) {
		i = (vc - v)/rs
	}else{
		i = 0
	}
}

COMMENT
The SOLVE of icur() in the BREAKPOINT block is necessary to compute
i=(vc - v(t))/rs instead of i=(vc - v(t-dt))/rs
This is important for time varying vc because the actual i used in
the implicit method is equivalent to (vc - v(t))/rs due to the
calculation of di/dv from the BREAKPOINT block.
The reason this works is because the SOLVE statement in the BREAKPOINT block
is executed after the membrane potential is advanced.

It is a shame that vstim has to be called twice but putting the call
in a SOLVE block would cause playing a Vector into vc to be off by one
time step.
ENDCOMMENT

: vstim(): selects the command voltage for the current time and updates i.
: vc is amp1 for t < dur1, amp2 for t < tc2, amp3 for t < tc3; after tc3 the
: clamp is switched off (vc = 0, on = 0). at_time is called for the end of
: every step whose duration is non-zero. icur() is called last so that i
: reflects the new vc/on.
PROCEDURE vstim() {
	on = 1
	if (dur1) {at_time(dur1)}
	if (dur2) {at_time(tc2)}
	if (dur3) {at_time(tc3)}
	if (t < dur1) {
		vc = amp1
	}else if (t < tc2) {
		vc = amp2
	}else if (t < tc3) {
		vc = amp3
	}else {
		vc = 0
		on = 0
	}
	icur()
}


================================================
FILE: coreneuron/mechanism/mech_mapping.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstring>
#include <cstdlib>
#include <iostream>
#include <map>

#include "coreneuron/mechanism/mech_mapping.hpp"
#include "coreneuron/mechanism/mechanism.hpp"
#include "coreneuron/permute/data_layout.hpp"

namespace coreneuron {
using Offset = size_t;
using MechId = int;
using VariableName = const char*;

/*
 * Strict weak ordering for C strings: compares the characters the pointers
 * refer to (lexicographically, via strcmp) rather than the pointer values.
 */
struct cmp_str {
    bool operator()(char const* a, char const* b) const {
        const int order = std::strcmp(a, b);
        return order < 0;
    }
};

/*
 * Structure that map variable names of mechanisms to their value's location (offset) in memory
 */
using MechNamesMapping = std::map<MechId, std::map<VariableName, Offset, cmp_str>>;
static MechNamesMapping mechNamesMapping;

/*
 * Records (or overwrites) the offset of `variable_name` for mechanism
 * `mech_id`. The per-mechanism map is created on first use. The map stores
 * the pointer `variable_name` itself, not a copy of the string.
 */
static void set_an_offset(int mech_id, const char* variable_name, int offset) {
    auto& offsets_by_name = mechNamesMapping[mech_id];
    offsets_by_name[variable_name] = offset;
}

/*
 * Returns the address, inside ml->data, of variable `variable_name` of
 * mechanism `mech_id` for the instance at `node_index`.
 *
 * The variable's rank is looked up in the registered name mapping and turned
 * into a data index by get_data_index(). If the mechanism id or the variable
 * name has not been registered, an error is printed to stderr and the
 * process is aborted; the function never returns a null pointer.
 */
double* get_var_location_from_var_name(int mech_id,
                                       const char* variable_name,
                                       Memb_list* ml,
                                       int node_index) {
    const auto mech_entry = mechNamesMapping.find(mech_id);
    if (mech_entry == mechNamesMapping.end()) {
        std::cerr << "ERROR : no variable name mapping exist for mechanism id: " << mech_id
                  << std::endl;
        abort();
    }
    const auto& offsets_by_name = mech_entry->second;
    const auto var_entry = offsets_by_name.find(variable_name);
    if (var_entry == offsets_by_name.end()) {
        std::cerr << "ERROR : no value associtated to variable name: " << variable_name
                  << std::endl;
        abort();
    }
    int variable_rank = var_entry->second;
    int ix = get_data_index(node_index, variable_rank, mech_id, ml);
    return &(ml->data[ix]);
}

/*
 * Registers the offsets of all variables named in `variable_names` for
 * mechanism `mech_id`.
 *
 * `variable_names` is a sequence of name groups, each terminated by a null
 * pointer. Names are numbered consecutively across groups, starting at 0, in
 * the order they appear. Scanning stops once NB_MECH_VAR_CATEGORIES - 1 null
 * terminators have been seen (the category counter starts at 1).
 */
void register_all_variables_offsets(int mech_id, SerializedNames variable_names) {
    int nb_parsed_variables = 0;
    int current_categorie = 1;
    for (int idx = 0; current_categorie < NB_MECH_VAR_CATEGORIES; ++idx) {
        if (variable_names[idx]) {
            set_an_offset(mech_id, variable_names[idx], nb_parsed_variables);
            nb_parsed_variables++;
        } else {
            // a null entry closes the current category
            current_categorie++;
        }
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/mech_mapping.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

/*
 * todo : currently mod2c has exactly 4 different variable categories
 * that are registered to coreneuron.
 */
#define NB_MECH_VAR_CATEGORIES 4

/*
 * SerializedNames
 *
 * names are passed serialized using the following format:
 * SerializedNames : {"0",[[<CategorieNames>,]*0,]* [[<CategorieNames>,]* 0]}
 * All categories must be filled; if a category is empty, just another 0 follows.
 *
 * ex: {"0", "name1", "name2", 0, "name3", "name4", 0,0,0}
 *     This means the first categorie with names {name1,name2},
 *     the second categorie with {name3, name4}, 2 last categories are empty
 */
namespace coreneuron {
struct Memb_list;

using SerializedNames = const char**;

// return pointer to the value of a mechanism's variable; aborts if the mechanism id or variable name is unknown
extern double* get_var_location_from_var_name(int mech_id,
                                              const char* variable_name,
                                              Memb_list* ml,
                                              int local_index);

// initialize mapping of variable names of mechanism, to their places in memory
extern void register_all_variables_offsets(int mech_id, SerializedNames variable_names);

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/mechanism.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <string.h>

#include "coreneuron/nrnconf.h"
#include "coreneuron/utils/memory.h"

namespace coreneuron {
// OpenACC with PGI compiler has issue when union is used and hence use struct
// \todo check if newer PGI versions has resolved this issue
// Per-thread datum. Without OpenACC this is a union of a double, an int and
// two pointer types. With OpenACC it is a struct with separate members, and
// note that this variant has no `val` member.
#if defined(_OPENACC)
struct ThreadDatum {
    int i;
    double* pval;
    void* _pvoid;
};
#else
union ThreadDatum {
    double val;
    int i;
    double* pval;
    void* _pvoid;
};
#endif

/* will go away at some point */
// Identifies one point-process instance by instance index, type and thread.
struct Point_process {
    int _i_instance; /* instance index (presumably within its mechanism's Memb_list - confirm) */
    short _type;     /* presumably the mechanism type id - confirm */
    short _tid; /* NrnThread id */
};

/*
 * Buffer of received events stored as parallel arrays with capacity _size.
 */
struct NetReceiveBuffer_t {
    int* _displ;     /* _displ_cnt + 1 of these */
    int* _nrb_index; /* _cnt of these (order of increasing _pnt_index) */

    int* _pnt_index;
    int* _weight_index;
    double* _nrb_t;
    double* _nrb_flag;
    int _cnt;
    int _displ_cnt; /* number of unique _pnt_index */
    int _size;      /* capacity */
    int _pnt_offset;

    /*
     * Number of bytes accounted for the arrays at the current capacity:
     * three int arrays of _size entries, one int array of _size + 1 entries
     * and two double arrays of _size entries. The struct itself is not
     * included.
     */
    size_t size_of_object() {
        const size_t int_arrays_bytes = _size * sizeof(int) * 3;
        const size_t displ_bytes = (_size + 1) * sizeof(int);
        const size_t double_arrays_bytes = _size * sizeof(double) * 2;
        return int_arrays_bytes + displ_bytes + double_arrays_bytes;
    }
};

/*
 * Buffer of outgoing send requests stored as six parallel arrays with
 * capacity _size. All arrays are allocated with ecalloc_align and released
 * with free_memory, both in the destructor and when the buffer grows.
 *
 * NOTE(review): the type owns raw buffers but does not disable copying;
 * copying an instance would lead to the same buffers being freed twice.
 */
struct NetSendBuffer_t: MemoryManaged {
    int* _sendtype;  // net_send, net_event, net_move
    int* _vdata_index;
    int* _pnt_index;
    int* _weight_index;
    double* _nsb_t;
    double* _nsb_flag;
    int _cnt;
    int _size;       /* capacity */
    int reallocated; /* if buffer resized/reallocated, needs to be copy to cpu */

    /* Allocates zero-initialised arrays for `size` entries; the buffer starts empty. */
    NetSendBuffer_t(int size)
        : _size(size) {
        _cnt = 0;

        _sendtype = (int*) ecalloc_align(_size, sizeof(int));
        _vdata_index = (int*) ecalloc_align(_size, sizeof(int));
        _pnt_index = (int*) ecalloc_align(_size, sizeof(int));
        _weight_index = (int*) ecalloc_align(_size, sizeof(int));
        // reallocated == 1 marks the buffers as newly allocated (i.e. any previous
        // copy must be freed and the data recopied)
        reallocated = 1;
        _nsb_t = (double*) ecalloc_align(_size, sizeof(double));
        _nsb_flag = (double*) ecalloc_align(_size, sizeof(double));
    }

    /* Bytes held by the six arrays at the current capacity (struct itself excluded). */
    size_t size_of_object() {
        size_t nbytes = 0;
        nbytes += _size * sizeof(int) * 4;
        nbytes += _size * sizeof(double) * 2;
        return nbytes;
    }

    ~NetSendBuffer_t() {
        free_memory(_sendtype);
        free_memory(_vdata_index);
        free_memory(_pnt_index);
        free_memory(_weight_index);
        free_memory(_nsb_t);
        free_memory(_nsb_flag);
    }

    /*
     * Doubles the capacity of every array, preserving the existing entries.
     * In GPU builds the buffer cannot be reallocated, so this asserts instead.
     */
    void grow() {
#ifdef CORENEURON_ENABLE_GPU
        int cannot_reallocate_on_device = 0;
        assert(cannot_reallocate_on_device);
#else
        int new_size = _size * 2;
        grow_buf(&_sendtype, _size, new_size);
        grow_buf(&_vdata_index, _size, new_size);
        grow_buf(&_pnt_index, _size, new_size);
        grow_buf(&_weight_index, _size, new_size);
        grow_buf(&_nsb_t, _size, new_size);
        grow_buf(&_nsb_flag, _size, new_size);
        _size = new_size;
#endif
    }

  private:
    /*
     * Replaces *buf by a zero-initialised array of new_size elements holding
     * a copy of the first `size` elements of the old array.
     */
    template <typename T>
    void grow_buf(T** buf, int size, int new_size) {
        T* new_buf = nullptr;
        new_buf = (T*) ecalloc_align(new_size, sizeof(T));
        memcpy(new_buf, *buf, size * sizeof(T));
        // release with the same routine the destructor uses for these buffers
        free_memory(*buf);
        *buf = new_buf;
    }
};

/*
 * Per-mechanism list of instances: node indices, data arrays and event
 * buffers. Every member has a default initializer, so a default-constructed
 * Memb_list is empty (null pointers, zero counts).
 */
struct Memb_list {
    /* nodeindices contains all nodes this extension is responsible for,
     * ordered according to the matrix. This allows to access the matrix
     * directly via the nrn_actual_* arrays instead of accessing it in the
     * order of insertion and via the node-structure, making it more
     * cache-efficient */
    int* nodeindices = nullptr;
    int* _permute = nullptr;
    double* data = nullptr;
    Datum* pdata = nullptr;
    ThreadDatum* _thread = nullptr; /* thread specific data (when static is no good) */
    NetReceiveBuffer_t* _net_receive_buffer = nullptr;
    NetSendBuffer_t* _net_send_buffer = nullptr;
    int nodecount = 0; /* actual node count */
    int _nodecount_padded = 0;
    void* instance{nullptr}; /* mechanism instance struct */
    // nrn_acc_manager.cpp handles data movement to/from the accelerator as the
    // "private constructor" in the translated MOD file code is called before
    // the main nrn_acc_manager methods that copy thread/mechanism data to the
    // device
    void* global_variables{nullptr};
    std::size_t global_variables_size{};
};
}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/membfunc.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once

#include "coreneuron/mechanism/mechanism.hpp"
#include "coreneuron/utils/offload.hpp"
#include "coreneuron/utils/units.hpp"

#include <cmath>
#include <vector>

namespace coreneuron {

using Pfrpdat = Datum* (*) (void);

struct NrnThread;

using mod_alloc_t = void (*)(double*, Datum*, int);
using mod_f_t = void (*)(NrnThread*, Memb_list*, int);
using pnt_receive_t = void (*)(Point_process*, int, double);
using thread_table_check_t =
    void (*)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int);

/*
 * Memb_func structure contains all related informations of a mechanism
 */
struct Memb_func {
    // Callbacks of the mechanism. alloc/current/jacob/state/initialize have
    // the same types as the alloc/cur/jacob/stat/initialize parameters of
    // register_mech (presumably that is where they are filled in - confirm).
    mod_alloc_t alloc;
    mod_f_t current;
    mod_f_t jacob;
    mod_f_t state;
    mod_f_t initialize;
    mod_f_t constructor;
    mod_f_t destructor; /* only for point processes */
    // These are used for CoreNEURON-internal allocation/cleanup; they are kept
    // separate from the CONSTRUCTOR/DESTRUCTOR functions just above (one of
    // which is apparently only for point processes) for simplicity.
    mod_f_t private_constructor;
    mod_f_t private_destructor;
    Symbol* sym;
    int vectorized;
    int thread_size_;                       /* how many Datum needed in Memb_list if vectorized */
    void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */
    void (*thread_cleanup_)(ThreadDatum*);  /* before Memb_list._thread is freed */
    thread_table_check_t thread_table_check_;
    int is_point;
    void (*setdata_)(double*, Datum*);
    int* dparam_semantics; /* for nrncore writing. */
    // Declared here, defined elsewhere.
    ~Memb_func();
};

#define VINDEX       -1
#define CABLESECTION 1
#define MORPHOLOGY   2
#define CAP          3
#define EXTRACELL    5

#define nrnocCONST 1
#define DEP        2
#define STATE      3 /*See init.c and cabvars.h for order of nrnocCONST, DEP, and STATE */

#define BEFORE_INITIAL    0
#define AFTER_INITIAL     1
#define BEFORE_BREAKPOINT 2
#define AFTER_SOLVE       3
#define BEFORE_STEP       4
#define BEFORE_AFTER_SIZE 5 /* 1 more than the previous */
// One registered before/after callback; entries are chained through `next`.
struct BAMech {
    mod_f_t f;           // callback to invoke
    int type;            // presumably the mechanism type the callback belongs to - confirm
    struct BAMech* next; // next entry in the chain
};

extern int nrn_ion_global_map_size;
extern double** nrn_ion_global_map;
extern const int ion_global_map_member_size;

#define NRNPOINTER                                                            \
    4 /* added on to list of mechanism variables.These are                    \
pointers which connect variables  from other mechanisms via the _ppval array. \
*/

#define _AMBIGUOUS 5


extern int nrn_get_mechtype(const char*);
extern const char* nrn_get_mechname(int);  // slow. use memb_func[i].sym if posible
extern int register_mech(const char** m,
                         mod_alloc_t alloc,
                         mod_f_t cur,
                         mod_f_t jacob,
                         mod_f_t stat,
                         mod_f_t initialize,
                         mod_f_t private_constructor,
                         mod_f_t private_destructor,
                         int nrnpointerindex,
                         int vectorized);
extern int point_register_mech(const char**,
                               mod_alloc_t alloc,
                               mod_f_t cur,
                               mod_f_t jacob,
                               mod_f_t stat,
                               mod_f_t initialize,
                               mod_f_t private_constructor,
                               mod_f_t private_destructor,
                               int nrnpointerindex,
                               mod_f_t constructor,
                               mod_f_t destructor,
                               int vectorized);
extern void register_constructor(mod_f_t constructor);
using NetBufReceive_t = void (*)(NrnThread*);
extern void hoc_register_net_receive_buffering(NetBufReceive_t, int);

extern void hoc_register_net_send_buffering(int);

using nrn_watch_check_t = void (*)(NrnThread*, Memb_list*);
extern void hoc_register_watch_check(nrn_watch_check_t, int);

extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int);
extern void nrn_writes_conc(int, int);
/*
 * Returns 1000 * R * T / F, where R is units::gasconstant, F is
 * units::faraday and T is `celsius` shifted by 273.15.
 */
constexpr double ktf(double celsius) {
    double const kelvin = celsius + 273.15;
    double const scaled_rt = 1000. * units::gasconstant * kelvin;
    return scaled_rt / units::faraday;
}
// std::log isn't constexpr, but there are argument values for which nrn_nernst
// is a constant expression
/*
 * Nernst potential ktf(celsius) / z * log(co / ci) for the concentrations
 * ci and co and charge z.
 *
 * Special cases, checked in this order: z == 0 gives 0; ci <= 0 gives 1e6;
 * co <= 0 gives -1e6.
 */
constexpr double nrn_nernst(double ci, double co, double z, double celsius) {
    if (z == 0) {
        return 0.;
    }
    if (ci <= 0.) {
        return 1e6;
    }
    if (co <= 0.) {
        return -1e6;
    }
    return ktf(celsius) / z * std::log(co / ci);
}
/*
 * When bit 040 (octal) of `it` is set, recomputes the value stored p2
 * strides before p1 as nrn_nernst() of the values one and two strides after
 * it, using gimap[type][2] as the charge. Does nothing when the bit is clear.
 *
 * The stride is _cntml_padded; it is passed in directly because passing _nt
 * to this function causes the cray compiler to segfault during compilation.
 */
constexpr void nrn_wrote_conc(int type,
                              double* p1,
                              int p2,
                              int it,
                              double** gimap,
                              double celsius,
                              int _cntml_padded) {
    if (!(it & 040)) {
        return;
    }
    constexpr int _iml = 0;
    int const stride{_cntml_padded + _iml};
    double* base = p1 - p2 * stride;
    double const charge = gimap[type][2];
    base[0] = nrn_nernst(base[1 * stride], base[2 * stride], charge, celsius);
}
/*
 * Computes .001 * z * F * (ci * efun(-x) - co * efun(x)) with
 * x = z * v / ktf(celsius) and F = units::faraday, where
 * efun(x) = x / (exp(x) - 1), replaced by 1 - x/2 for |x| < 1e-4 to avoid
 * the 0/0 form.
 */
inline double nrn_ghk(double v, double ci, double co, double z, double celsius) {
    auto const efun = [](double x) {
        return std::abs(x) < 1e-4 ? 1. - x / 2. : x / (std::exp(x) - 1.);
    };
    double const x{z * v / ktf(celsius)};
    double const outside{co * efun(+x)};
    double const inside{ci * efun(-x)};
    return .001 * z * units::faraday * (inside - outside);
}
extern void hoc_register_prop_size(int, int, int);
extern void hoc_register_dparam_semantics(int type, int, const char* name);
extern void hoc_reg_ba(int, mod_f_t, int);

// Name/address pair for a scalar double; one of the argument types of
// hoc_register_var.
struct DoubScal {
    const char* name;
    double* pdoub;
};
// Name/address pair for an array of doubles; one of the argument types of
// hoc_register_var.
struct DoubVec {
    const char* name;
    double* pdoub;
    int index1;  // presumably the array length - confirm
};
// Pairs a name with a no-argument function; one of the argument types of hoc_register_var().
struct VoidFunc {
    const char* name;    // function name
    void (*func)(void);  // the function to call
};
// Takes pointers to the three table entry types declared above; defined elsewhere.
extern void hoc_register_var(DoubScal*, DoubVec*, VoidFunc*);

// (type, layout): stores the data layout for a mechanism type (see register_mech.cpp).
extern void _nrn_layout_reg(int, int);
// Stores f as the thread_cleanup_ callback of mechanism type i (ignored when i == -1).
extern void _nrn_thread_reg0(int i, void (*f)(ThreadDatum*));
// Stores f as the thread_mem_init_ callback of mechanism type i (ignored when i == -1).
extern void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*));

// Signature of the per-mechanism callback registered through hoc_reg_bbcore_read().
// The parameters are unnamed here; their meaning is defined by the callers and
// implementations, which are not part of this header.
using bbcore_read_t = void (*)(double*,
                               int*,
                               int*,
                               int*,
                               int,
                               int,
                               double*,
                               Datum*,
                               ThreadDatum*,
                               NrnThread*,
                               Memb_list*,
                               double);

// Signature of the per-mechanism callback registered through hoc_reg_bbcore_write().
// It has the same parameter list as bbcore_read_t.
using bbcore_write_t = void (*)(double*,
                                int*,
                                int*,
                                int*,
                                int,
                                int,
                                double*,
                                Datum*,
                                ThreadDatum*,
                                NrnThread*,
                                Memb_list*,
                                double);

// Writes the mechanism types that `type` depends on into `dependencies` and returns
// how many were written; the caller provides a large enough array.
extern int nrn_mech_depend(int type, int* dependencies);
// Bookkeeping for mechanisms with a FOR_NETCONS statement: count, type numbers,
// and index into the ppvar array (filled by add_nrn_fornetcons).
extern int nrn_fornetcon_cnt_;
extern int* nrn_fornetcon_type_;
extern int* nrn_fornetcon_index_;
extern void add_nrn_fornetcons(int, int);
extern void add_nrn_has_net_event(int);
extern void net_event(Point_process*, double);
extern void net_send(void**, int, Point_process*, double, double);
extern void net_move(void**, Point_process*, double);
extern void artcell_net_send(void**, int, Point_process*, double, double);
extern void artcell_net_move(void**, Point_process*, double);
extern void nrn2ncs_outputevent(int netcon_output_index, double firetime);
extern bool nrn_use_localgid_;

// _OPENACC and/or NET_RECEIVE_BUFFERING
// (previously declared twice with identical signatures; one declaration suffices)
extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double);

extern void hoc_malchk(void); /* just a stub */
extern void* hoc_Emalloc(size_t);

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/patternstim.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

// Want to have the classical NEURON PatternStim functionality available
// in coreneuron to allow debugging and trajectory verification on
// desktop single process tests.  Since pattern.mod provides most of what
// we need even in the coreneuron context, we placed a minimally modified
// version of that in coreneuron/mechanism/mech/modfile/pattern.mod and this file
// provides an interface that creates an instance of the
// PatternStim ARTIFICIAL_CELL in thread 0 and attaches the spike raster
// data to it.

#include <algorithm>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/io/output_spikes.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/coreneuron.hpp"

namespace coreneuron {
// Registration function of the translated PatternStim mod file
// (NOTE(review): the file header calls it pattern.mod — confirm the name).
void _pattern_reg(void);
// Defined in the mod file; receives the spike raster (size, tvec, gidvec) together
// with the instance index/count and the mechanism data pointers.
extern void pattern_stim_setup_helper(int size,
                                      double* tvec,
                                      int* gidvec,
                                      int icnt,
                                      int cnt,
                                      double* _p,
                                      Datum* _ppvar,
                                      ThreadDatum* _thread,
                                      NrnThread* _nt,
                                      Memb_list* ml,
                                      double v);

// Reads a raster file into newly allocated arrays and returns the number of spikes.
static size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double tstop);

// dparam size of PatternStim, set by nrn_set_extra_thread0_vdata() and checked in
// nrn_artcell_instantiate().
int nrn_extra_thread0_vdata;

/// Registers PatternStim if it has no `initialize` function yet and records its
/// dparam size in nrn_extra_thread0_vdata.
///
/// Limited to PatternStim for now. If called, it must be called before nrn_setup
/// and after mk_mech.
void nrn_set_extra_thread0_vdata() {
    int const pattern_type = nrn_get_mechtype("PatternStim");
    bool const already_registered = corenrn.get_memb_func(pattern_type).initialize != nullptr;
    if (!already_registered) {
        _pattern_reg();
    }
    auto const& dparam_sizes = corenrn.get_prop_dparam_size();
    nrn_extra_thread0_vdata = dparam_sizes[pattern_type];
}

// Creates the single PatternStim instance and hands it the spike raster read from
// `fname` (a raster file in the output_spikes.h format). Only spikes with time in
// [t, tstop] are kept (see read_raster_file).
//
// Requires that PatternStim has been registered, which in this file is done by
// nrn_set_extra_thread0_vdata(). Returns without doing anything when there are no
// threads or the first thread has no cells.
//
// todo : add function for memory cleanup (to be called at the end of simulation)
void nrn_mkPatternStim(const char* fname, double tstop) {
    int const type = nrn_get_mechtype("PatternStim");
    if (!corenrn.get_memb_func(type).sym) {
        printf(
            "nrn_set_extra_thread0_vdata must be called (after mk_mech, and before nrn_setup)\n");
        assert(0);
    }

    // if there is empty thread then return, don't need patternstim
    if (nrn_threads == nullptr || nrn_threads->ncell == 0) {
        return;
    }

    double* tvec = nullptr;
    int* gidvec = nullptr;

    // todo : handle when spike raster will be very large (int < size_t)
    size_t const size = read_raster_file(fname, &tvec, &gidvec, tstop);

    Point_process* pnt = nrn_artcell_instantiate("PatternStim");
    NrnThread* nt = nrn_threads + pnt->_tid;

    Memb_list* ml = nt->_ml_list[type];
    int const layout = corenrn.get_mech_data_layout()[type];
    int const sz = corenrn.get_prop_param_size()[type];
    int const psz = corenrn.get_prop_dparam_size()[type];
    int const _cntml = ml->nodecount;
    int const _iml = pnt->_i_instance;
    double* _p = ml->data;
    Datum* _ppvar = ml->pdata;
    if (layout == Layout::AoS) {
        // AoS: advance to this instance's own record.
        _p += _iml * sz;
        _ppvar += _iml * psz;
    } else {
        // SoA: the base pointers are passed unchanged; any other layout is an error.
        assert(layout == Layout::SoA);
    }
    // The helper takes the count as int; the narrowing is explicit here (see todo above).
    pattern_stim_setup_helper(
        static_cast<int>(size), tvec, gidvec, _iml, _cntml, _p, _ppvar, nullptr, nt, ml, 0.0);
}

/// Reads the spike raster file `fname`.
///
/// The first line is skipped. Every following "time gid" pair whose time lies in
/// [t, tstop] is kept. The kept spikes are sorted and copied into two arrays
/// allocated with emalloc, returned through `tvec` and `gidvec`.
/// @return the number of spikes stored in each array.
size_t read_raster_file(const char* fname, double** tvec, int** gidvec, double tstop) {
    FILE* const fp = fopen(fname, "r");
    nrn_assert(fp);

    // skip first line containing "scatter" string
    char header[100];
    nrn_assert(fgets(header, 100, fp));

    std::vector<std::pair<double, int>> raster;
    raster.reserve(10000);

    double spike_time;
    int spike_gid;
    for (;;) {
        if (fscanf(fp, "%lf %d\n", &spike_time, &spike_gid) != 2) {
            break;
        }
        if (spike_time >= t && spike_time <= tstop) {
            raster.emplace_back(spike_time, spike_gid);
        }
    }

    fclose(fp);

    // pattern.mod expects a sorted spike raster (this avoids injecting all
    // events at the beginning of the simulation), so order the pairs by time.
    std::sort(raster.begin(), raster.end());

    // copy the pairs into separate time and gid arrays
    size_t const count = raster.size();
    *tvec = (double*) emalloc(count * sizeof(double));
    *gidvec = (int*) emalloc(count * sizeof(int));

    size_t slot = 0;
    for (auto const& spike: raster) {
        (*tvec)[slot] = spike.first;
        (*gidvec)[slot] = spike.second;
        ++slot;
    }

    return count;
}

// Allocates a NrnThreadMembList node holding a one-instance Memb_list for `type`.
// See nrn_setup.cpp:read_phase2 for how the regular NrnThreadMembList instances are created.
static NrnThreadMembList* alloc_nrn_thread_memb(NrnThread* nt, int type) {
    auto* const node = (NrnThreadMembList*) ecalloc(1, sizeof(NrnThreadMembList));
    node->dependencies = nullptr;
    node->ndependencies = 0;
    node->index = type;
    node->next = nullptr;

    // The data is not placed in the cache efficient NrnThread arrays, but there
    // should not be many of these instances.
    int const n_param = corenrn.get_prop_param_size()[type];
    int const n_dparam = corenrn.get_prop_dparam_size()[type];
    int const layout = corenrn.get_mech_data_layout()[type];

    auto* const ml = (Memb_list*) ecalloc(1, sizeof(Memb_list));
    node->ml = ml;
    ml->nodecount = 1;
    ml->_nodecount_padded = ml->nodecount;
    ml->nodeindices = nullptr;
    ml->data = (double*) ecalloc(ml->nodecount * n_param, sizeof(double));
    ml->pdata = (Datum*) ecalloc(nrn_soa_padded_size(ml->nodecount, layout) * n_dparam,
                                 sizeof(Datum));
    ml->_thread = nullptr;
    ml->_net_receive_buffer = nullptr;
    ml->_net_send_buffer = nullptr;
    ml->_permute = nullptr;

    // Run the mechanism's private constructor, when it has one.
    auto const priv_ctor = corenrn.get_memb_func(node->index).private_constructor;
    if (priv_ctor) {
        priv_ctor(nt, ml, node->index);
    }

    return node;
}

// Opportunistically implemented to create a single PatternStim.
// It does only enough to get that functionally incorporated into the model;
// other types may require additional work. In particular, a new
// NrnThreadMembList with one item is appended to the thread 0 tml list so that
// the artificial cell gets its INITIAL block called, but none of the other
// thread 0 data arrays or counts are modified.
Point_process* nrn_artcell_instantiate(const char* mechname) {
    int const type = nrn_get_mechtype(mechname);
    NrnThread* const nt = nrn_threads + 0;

    // create the one-instance list and register it for this type
    NrnThreadMembList* const new_tml = alloc_nrn_thread_memb(nt, type);

    assert(nt->_ml_list[type] == nullptr);  // FIXME
    nt->_ml_list[type] = new_tml->ml;

    // append to nt->tml: find the null link at the end of the chain and fill it
    NrnThreadMembList** tail = &nt->tml;
    while (*tail) {
        tail = &(*tail)->next;
    }
    *tail = new_tml;

    // There is a problem here with no easy general solution. ml->pdata holds
    // integer indices into nt->_data, nt->_idata or nt->_vdata depending on
    // context, but nrn_setup.cpp allocated those with exactly the size needed by
    // the file defined model (at least for _vdata), so there are no slots for
    // this new instance to index into. nrn_setup.cpp:phase2 therefore has to be
    // told that extra space is required. For now the general case (several
    // instances of several types) is deferred and this function must not be
    // called more than once. nrn_extra_thread0_vdata (the only thing PatternStim
    // needs) is used by nrn_setup.cpp:phase2 to allocate a correspondingly larger
    // _vdata array for thread 0, without changing _nvdata, so that the indices
    // can be filled in here.
    static int n_calls = 0;
    n_calls += 1;
    if (n_calls > 1) {
        printf("nrn_artcell_instantiate cannot be called more than once\n");
        assert(0);
    }

    // PatternStim internal usage of the 4 ppvar values is:
    //   _nd_area  _nt->_data[_ppvar[0]]    (not used since ARTIFICIAL_CELL)
    //   _p_ptr    _nt->_vdata[_ppvar[2]]   (the BBCORE_POINTER)
    //   _tqitem   &(_nt->_vdata[_ppvar[3]]) (for net_send)
    // and general external usage is:
    //   _nt->_vdata[_ppvar[1]] = Point_process*
    Point_process* const pnt = new Point_process;
    pnt->_type = type;
    pnt->_tid = nt->id;
    pnt->_i_instance = 0;

    // as though all dparam index into _vdata, starting at the first extra slot
    int const dsize = corenrn.get_prop_dparam_size()[type];
    assert(dsize <= nrn_extra_thread0_vdata);
    auto const first_slot = nt->_nvdata;
    Datum* const pdata = new_tml->ml->pdata;
    for (int k = 0; k < dsize; ++k) {
        pdata[k] = first_slot + k;
    }
    nt->_vdata[first_slot + 1] = (void*) pnt;

    return pnt;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/register_mech.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstring>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/membrane_definitions.h"
#include "coreneuron/mechanism/eion.hpp"
#include "coreneuron/mechanism/mech_mapping.hpp"
#include "coreneuron/mechanism/membfunc.hpp"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

namespace coreneuron {
// Global simulation parameters. initnrn() assigns secondorder, t, dt, rev_dt and
// celsius their defaults; pi is not assigned anywhere in this file.
int secondorder = 0;
double t, dt, celsius, pi;
int rev_dt;

// Pointer to a function taking no arguments and returning nothing.
using Pfrv = void (*)();

// Records that mechanism `type` writes a concentration of ion `etype` (defined below).
static void ion_write_depend(int type, int etype);

/// Stores `f` as the bbcore_read callback of mechanism `type`; a type of -1 is ignored.
void hoc_reg_bbcore_read(int type, bbcore_read_t f) {
    if (type != -1) {
        auto& readers = corenrn.get_bbcore_read();
        readers[type] = f;
    }
}
/// Stores `f` as the bbcore_write callback of mechanism `type`; a type of -1 is ignored.
void hoc_reg_bbcore_write(int type, bbcore_write_t f) {
    if (type != -1) {
        auto& writers = corenrn.get_bbcore_write();
        writers[type] = f;
    }
}

/// Appends `type` to the list of mechanism types that have a net event; -1 is ignored.
void add_nrn_has_net_event(int type) {
    if (type != -1) {
        auto& with_net_event = corenrn.get_has_net_event();
        with_net_event.push_back(type);
    }
}

/* Bookkeeping for mechanisms which have a FOR_NETCONS statement. The two arrays are
   parallel, hold nrn_fornetcon_cnt_ entries each and are grown by add_nrn_fornetcons(). */
int nrn_fornetcon_cnt_;    /* how many models have a FOR_NETCONS statement */
int* nrn_fornetcon_type_;  /* what are the type numbers */
int* nrn_fornetcon_index_; /* what is the index into the ppvar array */

/// Appends (type, indx) to the FOR_NETCONS tables, growing both arrays by one
/// entry. A type of -1 is ignored.
void add_nrn_fornetcons(int type, int indx) {
    if (type == -1) {
        return;
    }

    int const slot = nrn_fornetcon_cnt_;
    nrn_fornetcon_cnt_ = slot + 1;

    nrn_fornetcon_type_ = (int*) erealloc(nrn_fornetcon_type_, (slot + 1) * sizeof(int));
    nrn_fornetcon_index_ = (int*) erealloc(nrn_fornetcon_index_, (slot + 1) * sizeof(int));

    nrn_fornetcon_type_[slot] = type;
    nrn_fornetcon_index_[slot] = indx;
}

/// Marks mechanism `type` as artificial and records `qi` as its queue index.
/// A type of -1 is ignored.
void add_nrn_artcell(int type, int qi) {
    if (type != -1) {
        corenrn.get_is_artificial()[type] = 1;
        corenrn.get_artcell_qindex()[type] = qi;
    }
}

/// Stores the receive callback, the receive-init callback and the size value for
/// mechanism `type`. A type of -1 is ignored.
void set_pnt_receive(int type,
                     pnt_receive_t pnt_receive,
                     pnt_receive_t pnt_receive_init,
                     short size) {
    if (type != -1) {
        corenrn.get_pnt_receive()[type] = pnt_receive;
        corenrn.get_pnt_receive_init()[type] = pnt_receive_init;
        corenrn.get_pnt_receive_size()[type] = size;
    }
}

/// Resizes every per-mechanism-type table to `memb_func_size_` entries.
/// New entries of the is-artificial table are filled with false and new entries
/// of the data-layout table with 1; the other tables use the plain resize.
void alloc_mech(int memb_func_size_) {
    auto const n_types = memb_func_size_;

    // mechanism function table and point-process maps
    corenrn.get_memb_funcs().resize(n_types);
    corenrn.get_pnt_map().resize(n_types);
    corenrn.get_pnt_receive().resize(n_types);
    corenrn.get_pnt_receive_init().resize(n_types);
    corenrn.get_pnt_receive_size().resize(n_types);
    corenrn.get_watch_check().resize(n_types);

    // artificial-cell information
    corenrn.get_is_artificial().resize(n_types, false);
    corenrn.get_artcell_qindex().resize(n_types);

    // data sizes and layout
    corenrn.get_prop_param_size().resize(n_types);
    corenrn.get_prop_dparam_size().resize(n_types);
    corenrn.get_mech_data_layout().resize(n_types, 1);

    // bbcore read/write callbacks
    corenrn.get_bbcore_read().resize(n_types);
    corenrn.get_bbcore_write().resize(n_types);
}

/// Resets the global simulation parameters to the DEF_* defaults.
void initnrn() {
    /* time starts at zero (msec) */
    t = 0.;
    /* time step (msec) and its reciprocal truncated to an integer (1/msec) */
    dt = DEF_dt;
    rev_dt = (int) (DEF_rev_dt);
    /* temperature in degrees celsius */
    celsius = DEF_celsius;
    /* >0 means crank-nicolson. 2 means currents adjusted to t+dt/2 */
    secondorder = DEF_secondorder;
}

/* Fills the Memb_func entry of the mechanism named m[1] and returns its type.
 * Returns -1 without touching anything when the mechanism is unknown.
 * If vectorized then thread_data_size is added to it: the stored thread size is
 * vectorized - 1 (0 when vectorized is 0). */
int register_mech(const char** m,
                  mod_alloc_t alloc,
                  mod_f_t cur,
                  mod_f_t jacob,
                  mod_f_t stat,
                  mod_f_t initialize,
                  mod_f_t private_constructor,
                  mod_f_t private_destructor,
                  int /* nrnpointerindex */,
                  int vectorized) {
    const char* const mech_name = m[1];
    int const type = nrn_get_mechtype(mech_name);

    // No mechanism in the .dat files
    if (type == -1) {
        return type;
    }

    assert(type);
#ifdef DEBUG
    printf("register_mech %s %d\n", mech_name, type);
#endif
    auto& entry = corenrn.get_memb_funcs()[type];

    // Keep an own copy of the name; a repeated registration must use the same name.
    if (!entry.sym) {
        entry.sym = (char*) emalloc(strlen(mech_name) + 1);
        strcpy(entry.sym, mech_name);
    } else {
        assert(strcmp(entry.sym, mech_name) == 0);
    }

    entry.current = cur;
    entry.jacob = jacob;
    entry.alloc = alloc;
    entry.state = stat;
    entry.initialize = initialize;
    entry.constructor = nullptr;
    entry.destructor = nullptr;
    entry.private_constructor = private_constructor;
    entry.private_destructor = private_destructor;
#if VECTORIZE
    entry.vectorized = vectorized ? 1 : 0;
    entry.thread_size_ = vectorized ? (vectorized - 1) : 0;
    entry.thread_mem_init_ = nullptr;
    entry.thread_cleanup_ = nullptr;
    entry.thread_table_check_ = nullptr;
    entry.is_point = 0;
    entry.setdata_ = nullptr;
    entry.dparam_semantics = nullptr;
#endif
    register_all_variables_offsets(type, &m[2]);
    return type;
}

/// Advances an internal counter when `type` is an ion; with CORENRN_DEBUG the
/// type and the counter are also printed. A type of -1 is ignored.
void nrn_writes_conc(int type, int /* unused */) {
    static int lastion = EXTRACELL + 1;
    if (type != -1) {
#if CORENRN_DEBUG
        printf("%s reordered from %d to %d\n", corenrn.get_memb_func(type).sym, type, lastion);
#endif
        if (nrn_is_ion(type)) {
            lastion += 1;
        }
    }
}

/// Records `layout` as the data layout of mechanism `type`.
void _nrn_layout_reg(int type, int layout) {
    auto& layouts = corenrn.get_mech_data_layout();
    layouts[type] = layout;
}

/// Appends the pair (f, type) to the net-receive-buffering list.
void hoc_register_net_receive_buffering(NetBufReceive_t f, int type) {
    auto& receivers = corenrn.get_net_buf_receive();
    receivers.emplace_back(f, type);
}

/// Appends `type` to the list of mechanism types that use net-send buffering.
void hoc_register_net_send_buffering(int type) {
    auto& send_types = corenrn.get_net_buf_send_type();
    send_types.push_back(type);
}

/// Stores `nwc` as the watch-check callback of mechanism `type`.
void hoc_register_watch_check(nrn_watch_check_t nwc, int type) {
    auto& checks = corenrn.get_watch_check();
    checks[type] = nwc;
}

/// Records the param size `psize` and dparam size `dpsize` of mechanism `type`.
/// When either differs from the stored value, the type is appended to the
/// "different mechanism type" list. A zeroed dparam_semantics array of `dpsize`
/// ints is allocated when `dpsize` is non-zero. A type of -1 is ignored.
void hoc_register_prop_size(int type, int psize, int dpsize) {
    if (type == -1) {
        return;
    }

    auto& param_sizes = corenrn.get_prop_param_size();
    auto& dparam_sizes = corenrn.get_prop_dparam_size();

    bool const differs = psize != param_sizes[type] || dpsize != dparam_sizes[type];
    if (differs) {
        corenrn.get_different_mechanism_type().push_back(type);
    }

    param_sizes[type] = psize;
    dparam_sizes[type] = dpsize;

    if (dpsize) {
        corenrn.get_memb_func(type).dparam_semantics = (int*) ecalloc(dpsize, sizeof(int));
    }
}
/// Stores the semantics code of dparam slot `ix` of mechanism `type`.
///
/// This is needed for SoA to possibly reorder name_ion and some "pointer"
/// pointers. The fixed names map to negative codes (see the table below). Any
/// other name is taken to be an ion mechanism name: "xx_ion" is stored as that
/// mechanism's type and "#xx_ion" as its type + 1000. The latter also records
/// that `type` writes the ion's concentration.
void hoc_register_dparam_semantics(int type, int ix, const char* name) {
    struct FixedSemantics {
        const char* name;
        int code;
    };
    static const FixedSemantics fixed_names[] = {{"area", -1},
                                                 {"iontype", -2},
                                                 {"cvodeieq", -3},
                                                 {"netsend", -4},
                                                 {"pointer", -5},
                                                 {"pntproc", -6},
                                                 {"bbcorepointer", -7},
                                                 {"watch", -8},
                                                 {"diam", -9},
                                                 {"fornetcon", -10}};

    auto& memb_func = corenrn.get_memb_funcs();
    int* const semantics = memb_func[type].dparam_semantics;

    bool is_fixed_name = false;
    for (auto const& candidate: fixed_names) {
        if (strcmp(name, candidate.name) == 0) {
            semantics[ix] = candidate.code;
            is_fixed_name = true;
            break;
        }
    }

    if (!is_fixed_name) {
        int const style = (name[0] == '#') ? 1 : 0;
        int const etype = nrn_get_mechtype(name + style);
        semantics[ix] = etype + style * 1000;
        /* note that if style is needed (style==1), then we are writing a concentration */
        if (style) {
            ion_write_depend(type, etype);
        }
    }
#if CORENRN_DEBUG
    printf("dparam semantics %s ix=%d %s %d\n",
           memb_func[type].sym,
           ix,
           name,
           memb_func[type].dparam_semantics[ix]);
#endif
}

/* Appends `type` to the list of mechanisms that write concentrations to ion `etype`.
 * Only ion types have a non-empty list. Each list is an array of integers whose
 * first element is the array size; the remaining size-1 elements are the types of
 * the mechanisms that write concentrations to that ion. */
static void ion_write_depend(int type, int etype) {
    auto& all_lists = corenrn.get_ion_write_dependency();
    auto const n_types = corenrn.get_memb_funcs().size();
    if (all_lists.size() < n_types) {
        all_lists.resize(n_types);
    }

    auto& writers = all_lists[etype];
    // A first entry needs two slots (size + one type); later ones grow by one.
    int const new_size = writers.empty() ? 2 : writers[0] + 1;

    writers.resize(new_size, 0);
    writers[0] = new_size;
    writers[new_size - 1] = type;
}

/* Appends `deptype` to the first `idep` entries of `dependencies` unless it equals
 * `type` or is already present. Returns the new number of entries. */
static int depend_append(int idep, int* dependencies, int deptype, int type) {
    if (deptype == type) {
        return idep;
    }
    for (int pos = 0; pos < idep; ++pos) {
        if (dependencies[pos] == deptype) {
            // already listed
            return idep;
        }
    }
    dependencies[idep] = deptype;
    return idep + 1;
}

/* Returns the list of types that this type depends on (10 should be more than enough).
 * `dependencies` must be an array large enough to hold that list; the number of
 * dependencies is returned.
 * A dparam semantics value strictly between 0 and 1000 names a dependency. When such
 * a dependency is newly added, the mechanisms recorded as writing that ion's
 * concentration are added as well. */
int nrn_mech_depend(int type, int* dependencies) {
    int const dpsize = corenrn.get_prop_dparam_size()[type];
    int const* const semantics = corenrn.get_memb_func(type).dparam_semantics;
    if (!semantics) {
        return 0;
    }

    int count = 0;
    for (int slot = 0; slot < dpsize; ++slot) {
        int const deptype = semantics[slot];
        if (!(deptype > 0 && deptype < 1000)) {
            continue;
        }
        int const before = count;
        count = depend_append(count, dependencies, deptype, type);
        bool const newly_added = count > before;
        if (newly_added && !corenrn.get_ion_write_dependency().empty() &&
            !corenrn.get_ion_write_dependency()[deptype].empty()) {
            auto& writers = corenrn.get_ion_write_dependency()[deptype];
            int const size = writers[0];  // first element is the array size
            for (int j = 1; j < size; ++j) {
                count = depend_append(count, dependencies, writers[j], type);
            }
        }
    }
    return count;
}

/// Stores `c` as the constructor of the last entry of the Memb_func table.
// NOTE(review): this writes to the last table entry, not to a specific mechanism
// type — confirm that callers intend that.
void register_constructor(mod_f_t c) {
    auto& funcs = corenrn.get_memb_funcs();
    funcs.back().constructor = c;
}

/// Stores `d` as the destructor of the last entry of the Memb_func table.
// NOTE(review): this writes to the last table entry, not to a specific mechanism
// type — confirm that callers intend that.
void register_destructor(mod_f_t d) {
    auto& funcs = corenrn.get_memb_funcs();
    funcs.back().destructor = d;
}

/// Assigns the next point-process number to the mechanism named `s2`, flags the
/// mechanism as a point process and returns the value stored in the point map.
/// Returns -1 when the mechanism is unknown.
int point_reg_helper(const Symbol* s2) {
    /* starts at 1 since 0 means not point in pnt_map */
    static int next_pointtype = 1;

    int const type = nrn_get_mechtype(s2);
    if (type == -1) {
        // No mechanism in the .dat files
        return type;
    }

    auto& pnt_map = corenrn.get_pnt_map();
    pnt_map[type] = next_pointtype++;
    corenrn.get_memb_func(type).is_point = 1;

    // Return the value as stored in the map.
    return pnt_map[type];
}

int point_register_mech(const char** m,
                        mod_alloc_t alloc,
                        mod_f_t cur,
                        mod_f_t jacob,
                        mod_f_t stat,
                        mod_f_t initialize,
                        mod_f_t private_constructor,
                        mod_f_t private_destructor,
                        int nrnpointerindex,
                        mod_f_t constructor,
                        mod_f_t destructor,
                        int vectorized) {
    const Symbol* s = m[1];
    register_mech(m,
                  alloc,
                  cur,
                  jacob,
                  stat,
                  initialize,
                  private_constructor,
                  private_destructor,
                  nrnpointerindex,
                  vectorized);
    register_constructor(constructor);
    register_destructor(destructor);
    return point_reg_helper(s);
}

// Intentionally empty.
void _modl_cleanup() {}

// Switches for state_discontinuity(): the assignment happens only while
// state_discon_allowed_ is non-zero and state_discon_flag_ is zero.
int state_discon_allowed_;
int state_discon_flag_ = 0;

/// Writes `d` to `*pd` when discontinuities are currently permitted; otherwise
/// leaves `*pd` untouched. The first argument is unused.
void state_discontinuity(int /* i */, double* pd, double d) {
    bool const permitted = state_discon_allowed_ && state_discon_flag_ == 0;
    if (!permitted) {
        return;
    }
    *pd = d;
}

/// Queues `f` as a before/after callback of mechanism `mt`.
///
/// @param mt   mechanism type; -1 (mechanism not present) makes the call a no-op,
///             consistent with the other registration functions in this file.
/// @param f    the callback.
/// @param type block kind as emitted by the translator (see bablk in
///             src/nmodl/nocpout.c): 11 BEFORE_BREAKPOINT, 22 AFTER_SOLVE,
///             13 BEFORE_INITIAL, 23 AFTER_INITIAL, 14 BEFORE_STEP. A value of -1
///             is ignored; any other value prints a message and calls nrn_exit(1).
///
/// Callbacks of one kind are kept in call order.
void hoc_reg_ba(int mt, mod_f_t f, int type) {
    // Guard on the mechanism type as well as on the block kind: a BAMech for
    // mechanism -1 must never be queued.
    if (mt == -1 || type == -1) {
        return;
    }

    switch (type) { /* see bablk in src/nmodl/nocpout.c */
        case 11:
            type = BEFORE_BREAKPOINT;
            break;
        case 22:
            type = AFTER_SOLVE;
            break;
        case 13:
            type = BEFORE_INITIAL;
            break;
        case 23:
            type = AFTER_INITIAL;
            break;
        case 14:
            type = BEFORE_STEP;
            break;
        default:
            printf("before-after processing type %d for %s not implemented\n",
                   type,
                   corenrn.get_memb_func(mt).sym);
            nrn_exit(1);
    }
    auto bam = (BAMech*) emalloc(sizeof(BAMech));
    bam->f = f;
    bam->type = mt;
    bam->next = nullptr;
    // keep in call order: append at the end of the list for this kind
    if (!corenrn.get_bamech()[type]) {
        corenrn.get_bamech()[type] = bam;
    } else {
        BAMech* last = corenrn.get_bamech()[type];
        while (last->next) {
            last = last->next;
        }
        last->next = bam;
    }
}

/// Stores `f` as the thread_cleanup_ callback of mechanism type `i`; -1 is ignored.
void _nrn_thread_reg0(int i, void (*f)(ThreadDatum*)) {
    if (i != -1) {
        corenrn.get_memb_func(i).thread_cleanup_ = f;
    }
}

/// Stores `f` as the thread_mem_init_ callback of mechanism type `i`; -1 is ignored.
void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)) {
    if (i != -1) {
        corenrn.get_memb_func(i).thread_mem_init_ = f;
    }
}

/// Stores `f` as the thread_table_check_ callback of mechanism type `i`; -1 is ignored.
void _nrn_thread_table_reg(int i, thread_table_check_t f) {
    if (i != -1) {
        corenrn.get_memb_func(i).thread_table_check_ = f;
    }
}

/// Stores `call` as the setdata_ callback of mechanism type `i`; -1 is ignored.
void _nrn_setdata_reg(int i, void (*call)(double*, Datum*)) {
    if (i != -1) {
        corenrn.get_memb_func(i).setdata_ = call;
    }
}

/// Releases the name and the dparam semantics array owned by this entry.
Memb_func::~Memb_func() {
    // free() does nothing for a null pointer, so no explicit checks are needed.
    free(sym);
    free(dparam_semantics);
}

}  // namespace coreneuron


================================================
FILE: coreneuron/mechanism/register_mech.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once

// Declarations of the mechanism registration functions defined in register_mech.cpp.
namespace coreneuron {
// Marks mechanism `type` as artificial and records its queue index `qi`.
void add_nrn_artcell(int type, int qi);
// Stores the receive / receive-init callbacks and the size value for mechanism `type`.
void set_pnt_receive(int type,
                     pnt_receive_t pnt_receive,
                     pnt_receive_t pnt_receive_init,
                     short size);
// Resets the global simulation parameters (t, dt, celsius, ...) to their defaults.
extern void initnrn(void);
// Store the bbcore read/write callback of mechanism `type`.
extern void hoc_reg_bbcore_read(int type, bbcore_read_t f);
extern void hoc_reg_bbcore_write(int type, bbcore_write_t f);
// Stores the thread table check callback of mechanism type `i`.
// NOTE(review): the definition spells the callback type as thread_table_check_t;
// confirm that the two signatures match.
extern void _nrn_thread_table_reg(
    int i,
    void (*f)(int, int, double*, Datum*, ThreadDatum*, NrnThread*, Memb_list*, int));
// Resizes all per-mechanism-type tables to the given number of types.
extern void alloc_mech(int);

}  // namespace coreneuron


================================================
FILE: coreneuron/membrane_definitions.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/* /local/src/master/nrn/src/nrnoc/membdef.h,v 1.2 1995/02/13 20:20:42 hines Exp */

/* Default values for the simulation parameters and built-in mechanisms.
 * Every macro whose replacement is an expression is fully parenthesized so that
 * it expands safely inside a larger expression (e.g. x / DEF_rev_dt). */

/* numerical parameters */
#define DEF_nseg   1             /* default number of segments per section*/
#define DEF_dt     .025          /* ms */
#define DEF_rev_dt (1. / DEF_dt) /* 1/ms */
/* >0 means crank-nicolson. 2 means current adjusted to t+dt/2 */
#define DEF_secondorder 0

/*global parameters */
#define DEF_Ra      35.4 /* ohm-cm */ /*changed from 34.5 on 1/6/95*/
#define DEF_celsius 6.3               /* deg-C */

#define DEF_vrest -65. /* mV */

/* old point process parameters */
/* fclamp */
#define DEF_clamp_resist 1e-3 /* megohm */

/* Parameters that are used in mechanism _alloc() procedures */
/* cable */
#define DEF_L          100. /* microns */
#define DEF_rallbranch 1.

/* morphology */
#define DEF_diam 500. /* microns */

/* capacitance */
#define DEF_cm 1. /* uF/cm^2 */

/* fast passive (e_p and g_p)*/
#define DEF_e DEF_vrest /* mV */
#define DEF_g 5.e-4     /* S/cm^2 */

/* na_ion */
#define DEF_nai 10.                /* mM */
#define DEF_nao 140.               /* mM */
#define DEF_ena (115. + DEF_vrest) /* mV */

/* k_ion */
#define DEF_ki 54.4               /* mM */
#define DEF_ko 2.5                /* mM */
#define DEF_ek (-12. + DEF_vrest) /* mV */

/* ca_ion -> DEF_eca calls log(), hence the <math.h> include below */
#define DEF_cai 5.e-5 /* mM */
#define DEF_cao 2.    /* mM */
#include <math.h>
#define DEF_eca (12.5 * log(DEF_cao / DEF_cai)) /* mV */

/* default ion values */
#define DEF_ioni 1. /* mM */
#define DEF_iono 1. /* mM */
#define DEF_eion 0. /* mV */


================================================
FILE: coreneuron/mpi/core/nrnmpi.hpp
================================================
#pragma once

namespace coreneuron {
// Number of MPI processes; defined in nrnmpi_def_cinc.cpp with initial value 1.
extern int nrnmpi_numprocs;
// Rank of this process; defined in nrnmpi_def_cinc.cpp with initial value 0.
extern int nrnmpi_myid;
}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/core/nrnmpi_def_cinc.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

namespace coreneuron {
/* size: number of processes, 1 by default */
int nrnmpi_numprocs{1};
/* rank: id of this process, 0 by default */
int nrnmpi_myid{0};
}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/core/nrnmpidec.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "../nrnmpi.h"

namespace coreneuron {

// Definitions of the mpi_function wrapper objects. Each wrapper is constructed with
// the name of the corresponding *_impl symbol as a string.
// NOTE(review): presumably that string is the name mpi_function_base::resolve()
// (resolve.cpp) looks up with dlsym — confirm against the class definition.

/* from nrnmpi.cpp */
mpi_function<cnrn_make_integral_constant_t(nrnmpi_init_impl)> nrnmpi_init{"nrnmpi_init_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_finalize_impl)> nrnmpi_finalize{
    "nrnmpi_finalize_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_check_threading_support_impl)>
    nrnmpi_check_threading_support{"nrnmpi_check_threading_support_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_write_file_impl)> nrnmpi_write_file{
    "nrnmpi_write_file_impl"};

/* from mpispike.c */
mpi_function<cnrn_make_integral_constant_t(nrnmpi_spike_exchange_impl)> nrnmpi_spike_exchange{
    "nrnmpi_spike_exchange_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_spike_exchange_compressed_impl)>
    nrnmpi_spike_exchange_compressed{"nrnmpi_spike_exchange_compressed_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_allmax_impl)> nrnmpi_int_allmax{
    "nrnmpi_int_allmax_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_allgather_impl)> nrnmpi_int_allgather{
    "nrnmpi_int_allgather_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_alltoall_impl)> nrnmpi_int_alltoall{
    "nrnmpi_int_alltoall_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_alltoallv_impl)> nrnmpi_int_alltoallv{
    "nrnmpi_int_alltoallv_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_alltoallv_impl)> nrnmpi_dbl_alltoallv{
    "nrnmpi_dbl_alltoallv_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allmin_impl)> nrnmpi_dbl_allmin{
    "nrnmpi_dbl_allmin_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allmax_impl)> nrnmpi_dbl_allmax{
    "nrnmpi_dbl_allmax_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_barrier_impl)> nrnmpi_barrier{
    "nrnmpi_barrier_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allreduce_impl)> nrnmpi_dbl_allreduce{
    "nrnmpi_dbl_allreduce_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allreduce_vec_impl)> nrnmpi_dbl_allreduce_vec{
    "nrnmpi_dbl_allreduce_vec_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_long_allreduce_vec_impl)>
    nrnmpi_long_allreduce_vec{"nrnmpi_long_allreduce_vec_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_initialized_impl)> nrnmpi_initialized{
    "nrnmpi_initialized_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_abort_impl)> nrnmpi_abort{"nrnmpi_abort_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_wtime_impl)> nrnmpi_wtime{"nrnmpi_wtime_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_local_rank_impl)> nrnmpi_local_rank{
    "nrnmpi_local_rank_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_local_size_impl)> nrnmpi_local_size{
    "nrnmpi_local_size_impl"};
// The multisend wrappers exist only when NRN_MULTISEND is enabled.
#if NRN_MULTISEND
mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_comm_impl)> nrnmpi_multisend_comm{
    "nrnmpi_multisend_comm_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_impl)> nrnmpi_multisend{
    "nrnmpi_multisend_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_single_advance_impl)>
    nrnmpi_multisend_single_advance{"nrnmpi_multisend_single_advance_impl"};
mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_conserve_impl)>
    nrnmpi_multisend_conserve{"nrnmpi_multisend_conserve_impl"};
#endif  // NRN_MULTISEND

}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/core/resolve.cpp
================================================
#include <dlfcn.h>
#include <sstream>
#include "../nrnmpi.h"

namespace coreneuron {
// Those functions are part of a mechanism to dynamically load mpi or not
void mpi_manager_t::resolve_symbols(void* handle) {
    for (auto* ptr: m_function_ptrs) {
        assert(!(*ptr));
        ptr->resolve(handle);
        assert(*ptr);
    }
}

// Look up this wrapper's symbol (m_name) in the library identified by `handle`
// and store the resulting address in m_fptr.
//
// Throws std::runtime_error when dlerror() reports a failure for the lookup.
void mpi_function_base::resolve(void* handle) {
    // Reset the dl error state so that the check below only sees errors
    // produced by the dlsym() call that follows.
    dlerror();
    void* const ptr = dlsym(handle, m_name);
    if (const char* const error = dlerror()) {
        std::ostringstream oss;
        oss << "Could not get symbol " << m_name << " from handle " << handle << ": " << error;
        throw std::runtime_error(oss.str());
    }
    assert(ptr);
    m_fptr = ptr;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/lib/mpispike.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/nrnconf.h"
/* do not want the redef in the dynamic load case */
#include "coreneuron/mpi/nrnmpiuse.h"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/nrnmpidec.h"
#include "nrnmpi.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/utils/nrn_assert.h"

#include <mpi.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>

namespace coreneuron {
// Communicator used for all collectives in this file (defined in nrnmpi.cpp).
extern MPI_Comm nrnmpi_comm;

// Number of ranks; copied from nrnmpi_numprocs_ the first time displs is allocated.
static int np;
// Per-rank receive displacements for MPI_Allgatherv; allocated lazily on first exchange.
static int* displs{nullptr};
static int* byteovfl{nullptr}; /* for the compressed transfer method */
// MPI datatype describing NRNMPI_Spike; created by nrnmpi_spike_initialize().
static MPI_Datatype spike_type;

// Allocate `size` bytes with malloc() and never return a null pointer.
//
// The check is performed unconditionally (not via assert) so that builds
// compiled with NDEBUG also stop on allocation failure instead of handing a
// null pointer to callers that immediately write through it.
static void* emalloc(size_t size) {
    void* memptr = malloc(size);
    if (memptr == nullptr) {
        fprintf(stderr, "emalloc: failed to allocate %zu bytes\n", size);
        abort();
    }
    return memptr;
}

// Register type NRNMPI_Spike
void nrnmpi_spike_initialize() {
    NRNMPI_Spike s;
    int block_lengths[2] = {1, 1};
    MPI_Aint addresses[3];

    MPI_Get_address(&s, &addresses[0]);
    MPI_Get_address(&(s.gid), &addresses[1]);
    MPI_Get_address(&(s.spiketime), &addresses[2]);

    MPI_Aint displacements[2] = {addresses[1] - addresses[0], addresses[2] - addresses[0]};

    MPI_Datatype typelist[2] = {MPI_INT, MPI_DOUBLE};
    MPI_Type_create_struct(2, block_lengths, displacements, typelist, &spike_type);
    MPI_Type_commit(&spike_type);
}

#if nrn_spikebuf_size > 0

static MPI_Datatype spikebuf_type;

// Register type NRNMPI_Spikebuf
static void make_spikebuf_type() {
    NRNMPI_Spikebuf s;
    int block_lengths[3] = {1, nrn_spikebuf_size, nrn_spikebuf_size};
    MPI_Datatype typelist[3] = {MPI_INT, MPI_INT, MPI_DOUBLE};

    MPI_Aint addresses[4];
    MPI_Get_address(&s, &addresses[0]);
    MPI_Get_address(&(s.nspike), &addresses[1]);
    MPI_Get_address(&(s.gid[0]), &addresses[2]);
    MPI_Get_address(&(s.spiketime[0]), &addresses[3]);

    MPI_Aint displacements[3] = {addresses[1] - addresses[0],
                                 addresses[2] - addresses[0],
                                 addresses[3] - addresses[0]};

    MPI_Type_create_struct(3, block_lengths, displacements, typelist, &spikebuf_type);
    MPI_Type_commit(&spikebuf_type);
}
#endif

// Block until every rank of nrnmpi_comm has arrived. Called at the start of
// nrnmpi_spike_exchange_impl inside the "imbalance" profiling phase, so the
// time spent waiting for slower ranks is reported separately from communication.
void wait_before_spike_exchange() {
    MPI_Barrier(nrnmpi_comm);
}

/**
 * Exchange the spikes generated on this rank with all other ranks.
 *
 * Two compile-time variants exist:
 *  - nrn_spikebuf_size == 0: an MPI_Allgather of the per-rank spike counts is
 *    followed by an MPI_Allgatherv of the spikes themselves.
 *  - nrn_spikebuf_size > 0: a fixed-size NRNMPI_Spikebuf is exchanged with
 *    MPI_Allgather and only the spikes that did not fit into it ("overflow")
 *    are exchanged with MPI_Allgatherv.
 *
 * @param nin       [out] per-rank counts passed to MPI_Allgatherv (all spikes
 *                  in the first variant, overflow spikes only in the second)
 * @param spikeout  spikes sent through MPI_Allgatherv
 * @param icapacity current capacity of *spikein in spikes; passed by value, so
 *                  a reallocation done here is not reflected in the caller's copy
 * @param spikein   [in,out] receive buffer; freed and reallocated when too small
 * @param ovfl      [out] total number of overflow spikes; only written when
 *                  nrn_spikebuf_size > 0
 * @param nout      number of spikes generated on this rank
 * @param spbufout  fixed-size send buffer (used only when nrn_spikebuf_size > 0)
 * @param spbufin   fixed-size receive buffers, one per rank (used only when
 *                  nrn_spikebuf_size > 0)
 * @return total number of spikes over all ranks
 */
int nrnmpi_spike_exchange_impl(int* nin,
                               NRNMPI_Spike* spikeout,
                               int icapacity,
                               NRNMPI_Spike** spikein,
                               int& ovfl,
                               int nout,
                               NRNMPI_Spikebuf* spbufout,
                               NRNMPI_Spikebuf* spbufin) {
    nrn_assert(spikein);
    Instrumentor::phase_begin("spike-exchange");

    {
        // Scoped phase: measures only the time spent waiting at the barrier.
        Instrumentor::phase p("imbalance");
        wait_before_spike_exchange();
    }

    Instrumentor::phase_begin("communication");
    // One-time setup of the displacement array (and the spikebuf datatype).
    if (!displs) {
        np = nrnmpi_numprocs_;
        displs = (int*) emalloc(np * sizeof(int));
        displs[0] = 0;
#if nrn_spikebuf_size > 0
        make_spikebuf_type();
#endif
    }
#if nrn_spikebuf_size == 0
    // Step 1: every rank learns how many spikes each other rank will send.
    MPI_Allgather(&nout, 1, MPI_INT, nin, 1, MPI_INT, nrnmpi_comm);
    // Step 2: prefix sum of the counts gives the receive displacements; n ends
    // up as the total number of spikes.
    int n = nin[0];
    for (int i = 1; i < np; ++i) {
        displs[i] = n;
        n += nin[i];
    }
    if (n) {
        // Grow the receive buffer if needed; old contents are not preserved
        // because the gather below overwrites them.
        if (icapacity < n) {
            icapacity = n + 10;
            free(*spikein);
            *spikein = (NRNMPI_Spike*) emalloc(icapacity * sizeof(NRNMPI_Spike));
        }
        MPI_Allgatherv(spikeout, nout, spike_type, *spikein, nin, displs, spike_type, nrnmpi_comm);
    }
#else
    // Step 1: exchange the fixed-size buffers; nspike holds the sender's full
    // spike count, which may exceed nrn_spikebuf_size.
    MPI_Allgather(spbufout, 1, spikebuf_type, spbufin, 1, spikebuf_type, nrnmpi_comm);
    int novfl = 0;
    int n = spbufin[0].nspike;
    if (n > nrn_spikebuf_size) {
        nin[0] = n - nrn_spikebuf_size;
        novfl += nin[0];
    } else {
        nin[0] = 0;
    }
    // Step 2: per rank, compute the overflow count and its displacement while
    // accumulating the total spike count in n.
    for (int i = 1; i < np; ++i) {
        displs[i] = novfl;
        int n1 = spbufin[i].nspike;
        n += n1;
        if (n1 > nrn_spikebuf_size) {
            nin[i] = n1 - nrn_spikebuf_size;
            novfl += nin[i];
        } else {
            nin[i] = 0;
        }
    }
    // Step 3: gather only the overflow spikes, if any rank produced some.
    if (novfl) {
        if (icapacity < novfl) {
            icapacity = novfl + 10;
            free(*spikein);
            *spikein = (NRNMPI_Spike*) emalloc(icapacity * sizeof(NRNMPI_Spike));
        }
        int n1 = (nout > nrn_spikebuf_size) ? nout - nrn_spikebuf_size : 0;
        MPI_Allgatherv(spikeout, n1, spike_type, *spikein, nin, displs, spike_type, nrnmpi_comm);
    }
    ovfl = novfl;
#endif
    Instrumentor::phase_end("communication");
    Instrumentor::phase_end("spike-exchange");
    return n;
}

/*
The compressed spike format is restricted to the fixed step method and is
a sequence of unsigned char.
nspike = buf[0]*256 + buf[1]
a sequence of spiketime, localgid pairs. There are nspike of them.
        spiketime is relative to the last transfer time in units of dt.
        note that this requires a mindelay < 256*dt.
        localgid is an unsigned int, unsigned short,
        or unsigned char in size depending on the range and thus takes
        4, 2, or 1 byte respectively. To be machine independent we do our
        own byte coding. When the localgid range is smaller than the true
        gid range, the gid->PreSyn are remapped into
        hostid specific	maps. If there are not many holes, i.e just about every
        spike from a source machine is delivered to some cell on a
        target machine, then instead of	a hash map, a vector is used.
The allgather sends the first part of the buf and the allgatherv buffer
sends any overflow.
*/
// Exchange spikes in the compressed byte format described above.
//
// localgid_size   number of bytes used to encode one local gid
// spfixin_ovfl    [in,out] receive buffer for overflow bytes; freed and
//                 reallocated here when too small
// send_nspike     number of spikes that fit into the fixed-size part
// nin             [out] number of spikes sent by each rank
// ovfl_capacity   capacity of spfixin_ovfl in spikes; passed by value, so growth
//                 done here is not reflected in the caller's copy
// spikeout_fixed  send buffer: fixed part followed contiguously by the overflow
// ag_send_size    size in bytes of the fixed part exchanged by MPI_Allgather
// spikein_fixed   receive buffer holding ag_send_size bytes per rank
// ovfl            [out] total number of overflow spikes over all ranks
//
// Returns the total number of spikes over all ranks.
int nrnmpi_spike_exchange_compressed_impl(int localgid_size,
                                          unsigned char*& spfixin_ovfl,
                                          int send_nspike,
                                          int* nin,
                                          int ovfl_capacity,
                                          unsigned char* spikeout_fixed,
                                          int ag_send_size,
                                          unsigned char* spikein_fixed,
                                          int& ovfl) {
    // One-time allocation of the per-rank displacement and byte-count arrays.
    if (!displs) {
        np = nrnmpi_numprocs_;
        displs = (int*) emalloc(np * sizeof(int));
        displs[0] = 0;
    }
    if (!byteovfl) {
        byteovfl = (int*) emalloc(np * sizeof(int));
    }
    MPI_Allgather(
        spikeout_fixed, ag_send_size, MPI_BYTE, spikein_fixed, ag_send_size, MPI_BYTE, nrnmpi_comm);
    int novfl = 0;
    int ntot = 0;
    int bstot = 0;
    for (int i = 0; i < np; ++i) {
        displs[i] = bstot;
        int idx = i * ag_send_size;
        // The first two bytes of each rank's block hold its spike count,
        // most significant byte first.
        int n = spikein_fixed[idx++] * 256;
        n += spikein_fixed[idx++];
        ntot += n;
        nin[i] = n;
        if (n > send_nspike) {
            // Bytes of rank i's message that did not fit into the fixed part:
            // 2 header bytes plus (1 + localgid_size) bytes per spike, minus
            // what the allgather already delivered.
            int bs = 2 + n * (1 + localgid_size) - ag_send_size;
            byteovfl[i] = bs;
            bstot += bs;
            novfl += n - send_nspike;
        } else {
            byteovfl[i] = 0;
        }
    }
    if (novfl) {
        if (ovfl_capacity < novfl) {
            ovfl_capacity = novfl + 10;
            free(spfixin_ovfl);
            spfixin_ovfl = (unsigned char*) emalloc(ovfl_capacity * (1 + localgid_size) *
                                                    sizeof(unsigned char));
        }
        int bs = byteovfl[nrnmpi_myid_];
        /*
        Note that spikeout_fixed is a single buffer: the overflow is
        contiguous to the first part. The spfixin_ovfl buffer, however, is
        completely separate from spikein_fixed since the latter
        dynamically changes its size during a run.
        */
        MPI_Allgatherv(spikeout_fixed + ag_send_size,
                       bs,
                       MPI_BYTE,
                       spfixin_ovfl,
                       byteovfl,
                       displs,
                       MPI_BYTE,
                       nrnmpi_comm);
    }
    ovfl = novfl;
    return ntot;
}

// Return the maximum of `x` over all ranks of nrnmpi_comm.
int nrnmpi_int_allmax_impl(int x) {
    int global_max;
    MPI_Allreduce(&x, &global_max, 1, MPI_INT, MPI_MAX, nrnmpi_comm);
    return global_max;
}

extern void nrnmpi_int_alltoall_impl(int* s, int* r, int n) {
    MPI_Alltoall(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm);
}

extern void nrnmpi_int_alltoallv_impl(const int* s,
                                      const int* scnt,
                                      const int* sdispl,
                                      int* r,
                                      int* rcnt,
                                      int* rdispl) {
    MPI_Alltoallv(s, scnt, sdispl, MPI_INT, r, rcnt, rdispl, MPI_INT, nrnmpi_comm);
}

extern void nrnmpi_dbl_alltoallv_impl(double* s,
                                      int* scnt,
                                      int* sdispl,
                                      double* r,
                                      int* rcnt,
                                      int* rdispl) {
    MPI_Alltoallv(s, scnt, sdispl, MPI_DOUBLE, r, rcnt, rdispl, MPI_DOUBLE, nrnmpi_comm);
}

/* following are for the partrans */

void nrnmpi_int_allgather_impl(int* s, int* r, int n) {
    MPI_Allgather(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm);
}

// Return the minimum of `x` over all ranks of nrnmpi_comm.
double nrnmpi_dbl_allmin_impl(double x) {
    double global_min;
    MPI_Allreduce(&x, &global_min, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm);
    return global_min;
}

// Return the maximum of `x` over all ranks of nrnmpi_comm.
double nrnmpi_dbl_allmax_impl(double x) {
    double global_max;
    MPI_Allreduce(&x, &global_max, 1, MPI_DOUBLE, MPI_MAX, nrnmpi_comm);
    return global_max;
}

// Block until all ranks of nrnmpi_comm have reached this call.
void nrnmpi_barrier_impl() {
    MPI_Barrier(nrnmpi_comm);
}

double nrnmpi_dbl_allreduce_impl(double x, int type) {
    double result;
    MPI_Op tt;
    if (type == 1) {
        tt = MPI_SUM;
    } else if (type == 2) {
        tt = MPI_MAX;
    } else {
        tt = MPI_MIN;
    }
    MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, tt, nrnmpi_comm);
    return result;
}

void nrnmpi_dbl_allreduce_vec_impl(double* src, double* dest, int cnt, int type) {
    MPI_Op tt;
    assert(src != dest);
    if (type == 1) {
        tt = MPI_SUM;
    } else if (type == 2) {
        tt = MPI_MAX;
    } else {
        tt = MPI_MIN;
    }
    MPI_Allreduce(src, dest, cnt, MPI_DOUBLE, tt, nrnmpi_comm);
    return;
}

void nrnmpi_long_allreduce_vec_impl(long* src, long* dest, int cnt, int type) {
    MPI_Op tt;
    assert(src != dest);
    if (type == 1) {
        tt = MPI_SUM;
    } else if (type == 2) {
        tt = MPI_MAX;
    } else {
        tt = MPI_MIN;
    }
    MPI_Allreduce(src, dest, cnt, MPI_LONG, tt, nrnmpi_comm);
    return;
}

#if NRN_MULTISEND

// Communicator dedicated to multisend point-to-point traffic.
static MPI_Comm multisend_comm;

// Create the multisend communicator (a duplicate of MPI_COMM_WORLD) on the
// first call; later calls are no-ops.
//
// MPI_Comm is an opaque handle whose representation is implementation
// defined, so whether it has been created is tracked with an explicit flag
// instead of testing the handle itself for "falseness".
void nrnmpi_multisend_comm_impl() {
    static bool comm_created = false;
    if (!comm_created) {
        MPI_Comm_dup(MPI_COMM_WORLD, &multisend_comm);
        comm_created = true;
    }
}

// Send the single spike `spk` to each of the `n` ranks listed in `hosts`,
// using a non-blocking send with tag 1 on multisend_comm. Each request is
// released immediately after it is started; completion is not waited for here.
void nrnmpi_multisend_impl(NRNMPI_Spike* spk, int n, int* hosts) {
    for (int k = 0; k < n; ++k) {
        MPI_Request pending;
        const int destination = hosts[k];
        MPI_Isend(spk, 1, spike_type, destination, 1, multisend_comm, &pending);
        MPI_Request_free(&pending);
    }
}

// Poll multisend_comm for a tag-1 message from any rank. If one is pending,
// receive it into `spk`. Returns the probe flag: non-zero when a spike was
// received, zero otherwise.
int nrnmpi_multisend_single_advance_impl(NRNMPI_Spike* spk) {
    int message_pending = 0;
    MPI_Status status;
    MPI_Iprobe(MPI_ANY_SOURCE, 1, multisend_comm, &message_pending, &status);
    if (!message_pending) {
        return message_pending;
    }
    MPI_Recv(spk, 1, spike_type, MPI_ANY_SOURCE, 1, multisend_comm, &status);
    return message_pending;
}

// Return the sum over all ranks of multisend_comm of (nsend - nrecv).
int nrnmpi_multisend_conserve_impl(int nsend, int nrecv) {
    int local_balance = nsend - nrecv;
    int global_balance;
    MPI_Allreduce(&local_balance, &global_balance, 1, MPI_INT, MPI_SUM, multisend_comm);
    return global_balance;
}

#endif /*NRN_MULTISEND*/
}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/lib/nrnmpi.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <iostream>
#include <string>
#include <tuple>

#include "coreneuron/nrnconf.h"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/utils/nrn_assert.h"
#include "nrnmpi.hpp"
#if _OPENMP
#include <omp.h>
#endif
#include <mpi.h>
namespace coreneuron {

// Duplicate of MPI_COMM_WORLD, created in nrnmpi_init_impl.
MPI_Comm nrnmpi_world_comm;
// Duplicate of nrnmpi_world_comm; the communicator used by the collectives.
MPI_Comm nrnmpi_comm;
// Size of, and this process's rank in, nrnmpi_world_comm.
int nrnmpi_numprocs_;
int nrnmpi_myid_;

// Set to true by nrnmpi_init_impl; nrnmpi_finalize_impl does nothing unless it is set.
static bool nrnmpi_under_nrncontrol_{false};

// Print `msg` on rank 0 only, then abort the MPI job with error code -1 on
// whichever rank called this function.
static void nrn_fatal_error(const char* msg) {
    const bool is_root = (nrnmpi_myid_ == 0);
    if (is_root) {
        printf("%s\n", msg);
    }
    nrnmpi_abort_impl(-1);
}

// Initialise the MPI layer and return {number of ranks, rank of this process}.
//
// The setup is performed once per process: later calls only return the cached
// values. MPI itself is initialised here only if nobody has done so already
// (with MPI_THREAD_FUNNELED support requested when built with OpenMP). Unless
// `is_quiet` is set, rank 0 prints the process (and thread) configuration.
nrnmpi_init_ret_t nrnmpi_init_impl(int* pargc, char*** pargv, bool is_quiet) {
    // Guards the one-time setup; also avoids duplicating communicators twice.
    static bool setup_done = false;

    if (!setup_done) {
        nrnmpi_under_nrncontrol_ = true;

        const bool mpi_already_running = nrnmpi_initialized_impl();
        if (!mpi_already_running) {
#if defined(_OPENMP)
            int required = MPI_THREAD_FUNNELED;
            int provided;
            nrn_assert(MPI_Init_thread(pargc, pargv, required, &provided) == MPI_SUCCESS);

            nrn_assert(required <= provided);
#else
            nrn_assert(MPI_Init(pargc, pargv) == MPI_SUCCESS);
#endif
        }

        // Private communicators, then rank/size, then the spike datatype.
        nrn_assert(MPI_Comm_dup(MPI_COMM_WORLD, &nrnmpi_world_comm) == MPI_SUCCESS);
        nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrnmpi_comm) == MPI_SUCCESS);
        nrn_assert(MPI_Comm_rank(nrnmpi_world_comm, &nrnmpi_myid_) == MPI_SUCCESS);
        nrn_assert(MPI_Comm_size(nrnmpi_world_comm, &nrnmpi_numprocs_) == MPI_SUCCESS);
        nrnmpi_spike_initialize();

        const bool announce = !is_quiet && nrnmpi_myid_ == 0;
        if (announce) {
#if defined(_OPENMP)
            printf(" num_mpi=%d\n num_omp_thread=%d\n\n", nrnmpi_numprocs_, omp_get_max_threads());
#else
            printf(" num_mpi=%d\n\n", nrnmpi_numprocs_);
#endif
        }

        setup_done = true;
    }

    return {nrnmpi_numprocs_, nrnmpi_myid_};
}

// Release the communicators created by nrnmpi_init_impl and finalize MPI.
// Nothing happens unless nrnmpi_init_impl ran in this process and MPI reports
// itself as initialised.
void nrnmpi_finalize_impl(void) {
    if (!nrnmpi_under_nrncontrol_) {
        return;
    }
    if (!nrnmpi_initialized_impl()) {
        return;
    }
    MPI_Comm_free(&nrnmpi_world_comm);
    MPI_Comm_free(&nrnmpi_comm);
    MPI_Finalize();
}

// Check that the MPI library provides at least MPI_THREAD_FUNNELED.
// If the thread level reported by MPI_Query_thread is lower, the job is
// terminated through nrn_fatal_error (message printed on rank 0, then abort).
void nrnmpi_check_threading_support_impl() {
    int th = 0;
    MPI_Query_thread(&th);
    if (th < MPI_THREAD_FUNNELED) {
        // NOTE: the backslash continues the string literal on the next line, so
        // that line's leading whitespace is part of the printed message.
        nrn_fatal_error(
            "\n Current MPI library doesn't support MPI_THREAD_FUNNELED,\
                    \n Run without enabling multi-threading!");
    }
}

// Return true when MPI_Initialized reports that MPI has been initialised.
bool nrnmpi_initialized_impl() {
    int mpi_is_up = 0;
    MPI_Initialized(&mpi_is_up);
    return mpi_is_up ? true : false;
}

// Abort all processes of MPI_COMM_WORLD, passing `errcode` to MPI_Abort.
void nrnmpi_abort_impl(int errcode) {
    MPI_Abort(MPI_COMM_WORLD, errcode);
}

// Return the wall-clock time reported by MPI_Wtime.
double nrnmpi_wtime_impl() {
    return MPI_Wtime();
}

/**
 * Return local mpi rank within a shared memory node
 *
 * When performing certain operations, we need to know the rank of mpi
 * process on a given node. This function uses MPI 3 MPI_Comm_split_type
 * function and MPI_COMM_TYPE_SHARED key to find out the local rank.
 */
int nrnmpi_local_rank_impl() {
    int local_rank = 0;
    if (nrnmpi_initialized_impl()) {
        MPI_Comm local_comm;
        MPI_Comm_split_type(
            MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, nrnmpi_myid_, MPI_INFO_NULL, &local_comm);
        MPI_Comm_rank(local_comm, &local_rank);
        MPI_Comm_free(&local_comm);
    }
    return local_rank;
}

/**
 * Return number of ranks running on single shared memory node
 *
 * We use MPI 3 MPI_Comm_split_type function and MPI_COMM_TYPE_SHARED key to
 * determine number of mpi ranks within a shared memory node.
 */
int nrnmpi_local_size_impl() {
    int local_size = 1;
    if (nrnmpi_initialized_impl()) {
        MPI_Comm local_comm;
        MPI_Comm_split_type(
            MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, nrnmpi_myid_, MPI_INFO_NULL, &local_comm);
        MPI_Comm_size(local_comm, &local_size);
        MPI_Comm_free(&local_comm);
    }
    return local_size;
}

/**
 * Write given buffer to a file using MPI collective I/O
 *
 * For output like spikes, each rank has to write spike timing
 * information to a single file. This routine writes buffers
 * of length len1, len2, len3... at the offsets 0, 0+len1,
 * 0+len1+len2... This write op is a collective across
 * all ranks of the common MPI communicator used for spike exchange.
 *
 * The process is aborted when the buffer is too large for a single MPI write
 * or when opening/writing the file fails on the calling rank. Diagnostics for
 * open/write failures are printed by rank 0 only.
 *
 * @param filename Name of the file to write
 * @param buffer Buffer to write
 * @param length Length of the buffer to write
 */
void nrnmpi_write_file_impl(const std::string& filename, const char* buffer, size_t length) {
    MPI_File fh;
    MPI_Status status;

    // MPI element counts are `int`: refuse lengths that would be silently
    // truncated by the conversion instead of writing a partial buffer.
    const int count = static_cast<int>(length);
    if (count < 0 || static_cast<size_t>(count) != length) {
        std::cerr << "Error: buffer of " << length
                  << " bytes is too large for a single MPI write" << std::endl;
        abort();
    }

    // global offset into file; the value reduced must really be an
    // `unsigned long` to match MPI_UNSIGNED_LONG, whatever size_t is.
    unsigned long local_length = static_cast<unsigned long>(length);
    unsigned long offset = 0;
    MPI_Exscan(&local_length, &offset, 1, MPI_UNSIGNED_LONG, MPI_SUM, nrnmpi_comm);
    // The exclusive scan leaves the result on rank 0 undefined; pin it to the
    // start of the file explicitly.
    if (nrnmpi_myid_ == 0) {
        offset = 0;
    }

    int op_status = MPI_File_open(
        nrnmpi_comm, filename.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
    if (op_status != MPI_SUCCESS) {
        if (nrnmpi_myid_ == 0) {
            std::cerr << "Error while opening output file " << filename << std::endl;
        }
        // Every rank that failed must stop: continuing would use an invalid
        // file handle in the collective write below.
        abort();
    }

    op_status = MPI_File_write_at_all(fh, offset, buffer, count, MPI_BYTE, &status);
    if (op_status != MPI_SUCCESS) {
        if (nrnmpi_myid_ == 0) {
            std::cerr << "Error while writing output " << std::endl;
        }
        abort();
    }

    MPI_File_close(&fh);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/lib/nrnmpi.hpp
================================================
#pragma once

// This file contains declarations that do not leave the MPI library.
namespace coreneuron {
// Number of ranks and rank of this process; set by nrnmpi_init_impl.
extern int nrnmpi_numprocs_;
extern int nrnmpi_myid_;
// Creates the MPI datatype for NRNMPI_Spike; called from nrnmpi_init_impl.
void nrnmpi_spike_initialize();
}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/nrnmpi.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <cassert>
#include <string>
#include <type_traits>
#include <vector>

#include "coreneuron/mpi/nrnmpiuse.h"

#ifndef nrn_spikebuf_size
#define nrn_spikebuf_size 0
#endif

namespace coreneuron {
// Fixed-size spike buffer exchanged with MPI_Allgather when
// nrn_spikebuf_size > 0. With the default nrn_spikebuf_size of 0 both arrays
// have zero length.
struct NRNMPI_Spikebuf {
    // Number of spikes produced by the sending rank; may exceed
    // nrn_spikebuf_size, in which case the excess is exchanged separately.
    int nspike;
    // gid and spike time of the spikes that fit into the buffer.
    int gid[nrn_spikebuf_size];
    double spiketime[nrn_spikebuf_size];
};
}  // namespace coreneuron

namespace coreneuron {
// A single spike as exchanged between ranks. The matching MPI datatype
// (one MPI_INT followed by one MPI_DOUBLE) is built in nrnmpi_spike_initialize().
struct NRNMPI_Spike {
    int gid;
    double spiketime;
};

// Those functions and classes are part of a mechanism to dynamically or statically load mpi
// functions
struct mpi_function_base;

struct mpi_manager_t {
    void register_function(mpi_function_base* ptr) {
        m_function_ptrs.push_back(ptr);
    }
    void resolve_symbols(void* dlsym_handle);

  private:
    std::vector<mpi_function_base*> m_function_ptrs;
    // true when symbols are resolved
};

// Access the process-wide wrapper registry. The instance is a function-local
// static, so it exists before the first wrapper registers itself.
inline mpi_manager_t& mpi_manager() {
    static mpi_manager_t instance;
    return instance;
}

// Type-erased part of an MPI function wrapper: the symbol name and, once
// resolved, the address obtained for it.
struct mpi_function_base {
    // Store the symbol name and register this wrapper with the global manager.
    mpi_function_base(const char* name)
        : m_name{name} {
        mpi_manager().register_function(this);
    }

    // Look up m_name in the library behind `dlsym_handle` and store the address.
    void resolve(void* dlsym_handle);

    // True once a function address has been stored.
    operator bool() const {
        return m_fptr != nullptr;
    }

  protected:
    void* m_fptr{};
    const char* m_name;
};

// Typed wrapper around one MPI implementation function. With C++17 this could
// be written more simply as
//   template <auto fptr> struct function : function_base { ... };
// The primary template is intentionally empty; only the specialisation on
// std::integral_constant<function pointer type, function pointer> is used.
template <typename>
struct mpi_function {};

#define cnrn_make_integral_constant_t(x) std::integral_constant<std::decay_t<decltype(x)>, x>

template <typename function_ptr, function_ptr fptr>
struct mpi_function<std::integral_constant<function_ptr, fptr>>: mpi_function_base {
    using mpi_function_base::mpi_function_base;

    // Forward the call to the wrapped function. The argument types could in
    // principle be deduced from `function_ptr`.
    template <typename... Args>
    auto operator()(Args&&... args) const {
#ifdef CORENEURON_ENABLE_MPI_DYNAMIC
        // Dynamic MPI: the address must have been stored by resolve() beforehand.
        assert(m_fptr);
        const auto resolved = reinterpret_cast<function_ptr>(m_fptr);
        return resolved(std::forward<Args>(args)...);
#else
        // Static MPI: call `fptr` directly. This produces link errors if the
        // MPI implementation library is not linked.
        return fptr(std::forward<Args>(args)...);
#endif
    }
};
}  // namespace coreneuron
#include "coreneuron/mpi/nrnmpidec.h"


================================================
FILE: coreneuron/mpi/nrnmpidec.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/*
This file is processed by mkdynam.sh and so it is important that
the prototypes be of the form "type foo(type arg, ...)"
*/

#pragma once

#include <stdlib.h>

namespace coreneuron {
/* from nrnmpi.cpp */
// Result of nrnmpi_init_impl: the number of MPI ranks and the rank of the
// calling process.
struct nrnmpi_init_ret_t {
    int numprocs;
    int myid;
};
// Each `*_impl` function is declared with C linkage and paired with an
// mpi_function wrapper object of the same name without the `_impl` suffix.
extern "C" nrnmpi_init_ret_t nrnmpi_init_impl(int* pargc, char*** pargv, bool is_quiet);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_init_impl)> nrnmpi_init;
extern "C" void nrnmpi_finalize_impl(void);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_finalize_impl)> nrnmpi_finalize;
extern "C" void nrnmpi_check_threading_support_impl();
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_check_threading_support_impl)>
    nrnmpi_check_threading_support;
// Write given buffer to a new file using MPI collective I/O
extern "C" void nrnmpi_write_file_impl(const std::string& filename,
                                       const char* buffer,
                                       size_t length);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_write_file_impl)> nrnmpi_write_file;


/* from mpispike.cpp */
// Spike exchange and collective helpers. As above, every `*_impl` function is
// paired with an mpi_function wrapper object that callers use.
extern "C" int nrnmpi_spike_exchange_impl(int* nin,
                                          NRNMPI_Spike* spikeout,
                                          int icapacity,
                                          NRNMPI_Spike** spikein,
                                          int& ovfl,
                                          int nout,
                                          NRNMPI_Spikebuf* spbufout,
                                          NRNMPI_Spikebuf* spbufin);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_spike_exchange_impl)>
    nrnmpi_spike_exchange;
extern "C" int nrnmpi_spike_exchange_compressed_impl(int,
                                                     unsigned char*&,
                                                     int,
                                                     int*,
                                                     int,
                                                     unsigned char*,
                                                     int,
                                                     unsigned char*,
                                                     int& ovfl);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_spike_exchange_compressed_impl)>
    nrnmpi_spike_exchange_compressed;
extern "C" int nrnmpi_int_allmax_impl(int i);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_allmax_impl)> nrnmpi_int_allmax;
extern "C" void nrnmpi_int_allgather_impl(int* s, int* r, int n);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_allgather_impl)> nrnmpi_int_allgather;
extern "C" void nrnmpi_int_alltoall_impl(int* s, int* r, int n);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_alltoall_impl)> nrnmpi_int_alltoall;
extern "C" void nrnmpi_int_alltoallv_impl(const int* s,
                                          const int* scnt,
                                          const int* sdispl,
                                          int* r,
                                          int* rcnt,
                                          int* rdispl);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_alltoallv_impl)> nrnmpi_int_alltoallv;
extern "C" void nrnmpi_dbl_alltoallv_impl(double* s,
                                          int* scnt,
                                          int* sdispl,
                                          double* r,
                                          int* rcnt,
                                          int* rdispl);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_alltoallv_impl)> nrnmpi_dbl_alltoallv;
extern "C" double nrnmpi_dbl_allmin_impl(double x);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allmin_impl)> nrnmpi_dbl_allmin;
extern "C" double nrnmpi_dbl_allmax_impl(double x);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allmax_impl)> nrnmpi_dbl_allmax;
extern "C" void nrnmpi_barrier_impl(void);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_barrier_impl)> nrnmpi_barrier;
extern "C" double nrnmpi_dbl_allreduce_impl(double x, int type);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allreduce_impl)> nrnmpi_dbl_allreduce;
extern "C" void nrnmpi_dbl_allreduce_vec_impl(double* src, double* dest, int cnt, int type);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allreduce_vec_impl)>
    nrnmpi_dbl_allreduce_vec;
extern "C" void nrnmpi_long_allreduce_vec_impl(long* src, long* dest, int cnt, int type);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_long_allreduce_vec_impl)>
    nrnmpi_long_allreduce_vec;
extern "C" bool nrnmpi_initialized_impl();
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_initialized_impl)> nrnmpi_initialized;
extern "C" void nrnmpi_abort_impl(int);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_abort_impl)> nrnmpi_abort;
extern "C" double nrnmpi_wtime_impl();
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_wtime_impl)> nrnmpi_wtime;
extern "C" int nrnmpi_local_rank_impl();
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_local_rank_impl)> nrnmpi_local_rank;
extern "C" int nrnmpi_local_size_impl();
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_local_size_impl)> nrnmpi_local_size;
#if NRN_MULTISEND
// Multisend spike exchange entry points (only declared when NRN_MULTISEND is non-zero).
extern "C" void nrnmpi_multisend_comm_impl();
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_comm_impl)>
    nrnmpi_multisend_comm;
extern "C" void nrnmpi_multisend_impl(NRNMPI_Spike* spk, int n, int* hosts);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_impl)> nrnmpi_multisend;
extern "C" int nrnmpi_multisend_single_advance_impl(NRNMPI_Spike* spk);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_single_advance_impl)>
    nrnmpi_multisend_single_advance;
extern "C" int nrnmpi_multisend_conserve_impl(int nsend, int nrecv);
extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_conserve_impl)>
    nrnmpi_multisend_conserve;
#endif

}  // namespace coreneuron


================================================
FILE: coreneuron/mpi/nrnmpiuse.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

/* define to 1 if you want MPI specific features activated
   (optionally provided by CMake option NRNMPI) */
#ifndef NRNMPI
#define NRNMPI 1
#endif

/* define to 1 if you want multisend spike exchange available */
#ifndef NRN_MULTISEND
#define NRN_MULTISEND 1
#endif

/* define to 1 if you want parallel distributed cells (and gap junctions) */
#define PARANEURON 1

/* define to 1 if you want the MUSIC - MUlti SImulation Coordinator */
#undef NRN_MUSIC

/* define to the dll path if you want to load automatically */
#undef DLL_DEFAULT_FNAME

/* Number of times to retry a failed open */
#undef FILE_OPEN_RETRY

/* Define to 1 for possibility of rank 0 xopen/ropen a file and broadcast everywhere */
#undef USE_NRNFILEWRAP


================================================
FILE: coreneuron/network/cvodestb.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
// solver CVode stub to allow cvode as dll for mswindows version.

#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/utils/vrecitem.h"

#include "coreneuron/gpu/nrn_acc_manager.hpp"

namespace coreneuron {

// Fixed-step entry point for one thread: run the threshold check and then
// deliver all events (including binqueue events) up to t+dt/2.
// Does nothing when no NetCvode instance exists.
void deliver_net_events(NrnThread* nt) {
    if (!net_cvode_instance) {
        return;
    }
    net_cvode_instance->check_thresh(nt);
    net_cvode_instance->deliver_net_events(nt);
}

// Deliver events (but not binqueue events) up to nt->_t, then run every
// registered net_buf_receive callback for this thread.
void nrn_deliver_events(NrnThread* nt) {
    // Delivery must not leave the thread time changed: remember it and
    // restore it afterwards.
    double saved_time = nt->_t;
    if (net_cvode_instance) {
        net_cvode_instance->deliver_events(saved_time, nt);
    }
    nt->_t = saved_time;

    /* before executing on GPU, the NetReceiveBuffer_t on the GPU has to be updated */
    update_net_receive_buffer(nt);

    for (auto& receiver: corenrn.get_net_buf_receive()) {
        (*receiver.first)(nt);
    }
}

// Clear all pending events; does nothing when no NetCvode instance exists.
void clear_event_queue() {
    if (!net_cvode_instance) {
        return;
    }
    net_cvode_instance->clear_events();
}

// Initialise the event system through the NetCvode instance (if any) and, in
// GPU builds, refresh the device copy of each thread's weight vector.
void init_net_events() {
    if (net_cvode_instance) {
        net_cvode_instance->init_events();
    }

#ifdef CORENEURON_ENABLE_GPU
    /* weight vectors could be updated (from INITIAL block of NET_RECEIVE), update those on GPUs */
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread* nt = nrn_threads + ith;
        // Local aliases: the pragma clauses below refer to these names directly.
        double* weights = nt->weights;
        int n_weight = nt->n_weight;
        // Only threads that have weights and run on the GPU need the update.
        if (n_weight && nt->compute_gpu) {
            nrn_pragma_acc(update device(weights [0:n_weight]))
            nrn_pragma_omp(target update to(weights [0:n_weight]))
        }
    }
#endif
}

void nrn_play_init() {
    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread* nt = nrn_threads + ith;
        for (int i = 0; i < nt->n_vecplay; ++i) {
            ((PlayRecord*) nt->_vecplay[i])->play_init();
        }
    }
}

// Call continuous(nt->_t) on every vecplay entry of the given thread.
void fixed_play_continuous(NrnThread* nt) {
    int k = 0;
    while (k < nt->n_vecplay) {
        PlayRecord* item = (PlayRecord*) nt->_vecplay[k];
        item->continuous(nt->_t);
        ++k;
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/network/have2want.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

/*
To be included by a file that desires rendezvous rank exchange functionality.
Need to define HAVEWANT_t, HAVEWANT_alltoallv, and HAVEWANT2Int
*/

#ifdef have2want_h
#error "This implementation can only be included once"
/* The static function names could involve a macro name. */
#endif

#define have2want_h

#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/mpi/core/nrnmpi.hpp"

/*

A rank owns a set of HAVEWANT_t keys and wants information associated with
a set of HAVEWANT_t keys owned by unknown ranks.  Owners do not know which
ranks want their information. Ranks that want info do not know which ranks
own that info.

The have_to_want function returns two new vectors of keys along with
associated count and displacement vectors of length nrnmpi_numprocs and nrnmpi_numprocs+1
respectively. Note that send_to_want_displ[i+1] =
  send_to_want_cnt[i] + send_to_want_displ[i].

send_to_want[send_to_want_displ[i] to send_to_want_displ[i+1]] contains
the keys from this rank for which rank i wants information.

recv_from_have[recv_from_have_displ[i] to recv_from_have_displ[i+1]] contains
the keys for which rank i is sending information to this rank.

Note that on rank i, the order of keys in the rank j area of send_to_want
is the same order of keys on rank j in the ith area in recv_from_have.

The rendezvous_rank function is used to parallelize this computation
and minimize memory usage so that no single rank ever needs to know all keys.
*/

#ifndef HAVEWANT_t
#define HAVEWANT_t int
#endif
namespace coreneuron {
// round robin default rendezvous rank function
// Map a key to its rendezvous rank, round robin over nrnmpi_numprocs.
// The result is always in [0, nrnmpi_numprocs): the remainder of a negative
// key is negative in C++, and callers use the result directly as an array
// index, so it is shifted back into range.
static int default_rendezvous(HAVEWANT_t key) {
    return static_cast<int>((key % nrnmpi_numprocs + nrnmpi_numprocs) % nrnmpi_numprocs);
}

// Build the displacement array (exclusive prefix sum) for a per-rank count
// array. Returns a new[]-allocated array of nrnmpi_numprocs + 1 ints whose
// last element is the total count.
static int* cnt2displ(int* cnt) {
    const int nhost = nrnmpi_numprocs;
    int* const displ = new int[nhost + 1];
    int running = 0;
    displ[0] = running;
    for (int rank = 0; rank < nhost; ++rank) {
        running += cnt[rank];
        displ[rank + 1] = running;
    }
    return displ;
}

// Turn per-destination send counts into per-source receive counts.
// With MPI enabled this is an all-to-all of one int per rank; otherwise the
// counts are copied unchanged. Returns a new[]-allocated array of
// nrnmpi_numprocs ints.
static int* srccnt2destcnt(int* srccnt) {
    const int nhost = nrnmpi_numprocs;
    int* const destcnt = new int[nhost];
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        nrnmpi_int_alltoall(srccnt, destcnt, 1);
        return destcnt;
    }
#endif
    for (int rank = 0; rank < nhost; ++rank) {
        destcnt[rank] = srccnt[rank];
    }
    return destcnt;
}

// Send each key in data[0..size) to the rank chosen by rendezvous_rank and
// receive the keys other ranks sent here.
// Outputs (all allocated with new[] in this function or its helpers):
//   sdata/scnt/sdispl - keys grouped by destination rank, with counts and
//                       displacements;
//   rdata/rcnt/rdispl - keys received, grouped by source rank.
static void rendezvous_rank_get(HAVEWANT_t* data,
                                int size,
                                HAVEWANT_t*& sdata,
                                int*& scnt,
                                int*& sdispl,
                                HAVEWANT_t*& rdata,
                                int*& rcnt,
                                int*& rdispl,
                                int (*rendezvous_rank)(HAVEWANT_t)) {
    const int nhost = nrnmpi_numprocs;

    // First pass: how many keys go to each rank.
    scnt = new int[nhost];
    for (int rank = 0; rank < nhost; ++rank) {
        scnt[rank] = 0;
    }
    for (int k = 0; k < size; ++k) {
        const int dest = (*rendezvous_rank)(data[k]);
        ++scnt[dest];
    }

    sdispl = cnt2displ(scnt);
    rcnt = srccnt2destcnt(scnt);
    rdispl = cnt2displ(rcnt);
    sdata = new HAVEWANT_t[sdispl[nhost]];
    rdata = new HAVEWANT_t[rdispl[nhost]];

    // Second pass: scatter the keys into sdata, using scnt as a per-rank
    // fill cursor (it ends with the same values as after the first pass).
    for (int rank = 0; rank < nhost; ++rank) {
        scnt[rank] = 0;
    }
    for (int k = 0; k < size; ++k) {
        const int dest = (*rendezvous_rank)(data[k]);
        const int slot = sdispl[dest] + scnt[dest];
        sdata[slot] = data[k];
        ++scnt[dest];
    }
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        HAVEWANT_alltoallv(sdata, scnt, sdispl, rdata, rcnt, rdispl);
    } else
#endif
    {
        // Without MPI the received data is a copy of the sent data.
        const int total = sdispl[nhost];
        for (int k = 0; k < total; ++k) {
            rdata[k] = sdata[k];
        }
    }
}

// Match the keys this rank owns (have) with the keys this rank needs (want)
// across all ranks, going through intermediate rendezvous ranks.
//
// Inputs:
//   have, have_size  - keys owned by this rank.
//   want, want_size  - keys this rank wants information for.
//   rendezvous_rank  - maps a key to the rank where have/want are matched.
// Outputs (all allocated with new[] here or in helpers and not freed by this
// function; presumably the caller releases them - confirm against callers):
//   send_to_want, send_to_want_cnt, send_to_want_displ
//       - keys of this rank grouped by the rank that wants them.
//   recv_from_have, recv_from_have_cnt, recv_from_have_displ
//       - wanted keys grouped by the rank that owns them.
// Errors: calls hoc_execerror if a key is owned by more than one rank or if a
// wanted key is not owned by any rank.
static void have_to_want(HAVEWANT_t* have,
                         int have_size,
                         HAVEWANT_t* want,
                         int want_size,
                         HAVEWANT_t*& send_to_want,
                         int*& send_to_want_cnt,
                         int*& send_to_want_displ,
                         HAVEWANT_t*& recv_from_have,
                         int*& recv_from_have_cnt,
                         int*& recv_from_have_displ,
                         int (*rendezvous_rank)(HAVEWANT_t)) {
    // 1) Send have and want to the rendezvous ranks.
    // 2) Rendezvous rank matches have and want.
    // 3) Rendezvous ranks tell the want ranks which ranks own the keys
    // 4) Ranks that want tell owner ranks where to send.

    // 1) Send have and want to the rendezvous ranks.
    HAVEWANT_t *have_s_data, *have_r_data;
    int *have_s_cnt, *have_s_displ, *have_r_cnt, *have_r_displ;
    rendezvous_rank_get(have,
                        have_size,
                        have_s_data,
                        have_s_cnt,
                        have_s_displ,
                        have_r_data,
                        have_r_cnt,
                        have_r_displ,
                        rendezvous_rank);
    // assume it is an error if two ranks have the same key so create
    // hash table of key2rank. Will also need it for matching have and want
    HAVEWANT2Int havekey2rank = HAVEWANT2Int();
    for (int r = 0; r < nrnmpi_numprocs; ++r) {
        for (int i = 0; i < have_r_cnt[r]; ++i) {
            HAVEWANT_t key = have_r_data[have_r_displ[r] + i];
            if (havekey2rank.find(key) != havekey2rank.end()) {
                char buf[200];
                sprintf(buf, "key %lld owned by multiple ranks\n", (long long) key);
                hoc_execerror(buf, 0);
            }
            // r is the source rank of this segment, i.e. the owner of the key.
            havekey2rank[key] = r;
        }
    }
    // The have-side exchange buffers are no longer needed once the map is built.
    delete[] have_s_data;
    delete[] have_s_cnt;
    delete[] have_s_displ;
    delete[] have_r_data;
    delete[] have_r_cnt;
    delete[] have_r_displ;

    HAVEWANT_t *want_s_data, *want_r_data;
    int *want_s_cnt, *want_s_displ, *want_r_cnt, *want_r_displ;
    rendezvous_rank_get(want,
                        want_size,
                        want_s_data,
                        want_s_cnt,
                        want_s_displ,
                        want_r_data,
                        want_r_cnt,
                        want_r_displ,
                        rendezvous_rank);

    // 2) Rendezvous rank matches have and want.
    //    we already have made the havekey2rank map.
    // Create an array parallel to want_r_data which contains the ranks that
    // have that data.
    int n = want_r_displ[nrnmpi_numprocs];
    int* want_r_ownerranks = new int[n];
    for (int r = 0; r < nrnmpi_numprocs; ++r) {
        for (int i = 0; i < want_r_cnt[r]; ++i) {
            int ix = want_r_displ[r] + i;
            HAVEWANT_t key = want_r_data[ix];
            if (havekey2rank.find(key) == havekey2rank.end()) {
                char buf[200];
                sprintf(buf, "key = %lld is wanted but does not exist\n", (long long) key);
                hoc_execerror(buf, 0);
            }
            want_r_ownerranks[ix] = havekey2rank[key];
        }
    }
    delete[] want_r_data;

    // 3) Rendezvous ranks tell the want ranks which ranks own the keys
    // The ranks that want keys need to know the ranks that own those keys.
    // The want_s_ownerranks will be parallel to the want_s_data.
    // That is, each item defines the rank from which information associated
    // with that key is coming from
    int* want_s_ownerranks = new int[want_s_displ[nrnmpi_numprocs]];
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        // Reverse direction of the want exchange: the r-side counts and
        // displacements are the send side here.
        nrnmpi_int_alltoallv(want_r_ownerranks,
                             want_r_cnt,
                             want_r_displ,
                             want_s_ownerranks,
                             want_s_cnt,
                             want_s_displ);
    } else
#endif
    {
        for (int i = 0; i < want_r_displ[nrnmpi_numprocs]; ++i) {
            want_s_ownerranks[i] = want_r_ownerranks[i];
        }
    }
    delete[] want_r_ownerranks;
    delete[] want_r_cnt;
    delete[] want_r_displ;

    // 4) Ranks that want tell owner ranks where to send.
    // Finished with the rendezvous ranks. The ranks that want keys know the
    // owner ranks for those keys. The next step is for the want ranks to
    // tell the owner ranks where to send.
    // The parallel want_s_ownerranks and want_s_data are now uselessly ordered
    // by rendezvous rank. Reorganize so that want ranks can tell owner ranks
    // what they want.
    n = want_s_displ[nrnmpi_numprocs];
    delete[] want_s_displ;
    // want_s_cnt is reused: first as a histogram over owner ranks, then as a
    // per-rank fill cursor.
    for (int i = 0; i < nrnmpi_numprocs; ++i) {
        want_s_cnt[i] = 0;
    }
    HAVEWANT_t* old_want_s_data = want_s_data;
    want_s_data = new HAVEWANT_t[n];
    // compute the counts
    for (int i = 0; i < n; ++i) {
        int r = want_s_ownerranks[i];
        ++want_s_cnt[r];
    }
    want_s_displ = cnt2displ(want_s_cnt);
    for (int i = 0; i < nrnmpi_numprocs; ++i) {
        want_s_cnt[i] = 0;
    }  // recount while filling
    for (int i = 0; i < n; ++i) {
        int r = want_s_ownerranks[i];
        HAVEWANT_t key = old_want_s_data[i];
        want_s_data[want_s_displ[r] + want_s_cnt[r]] = key;
        ++want_s_cnt[r];
    }
    delete[] want_s_ownerranks;
    delete[] old_want_s_data;
    want_r_cnt = srccnt2destcnt(want_s_cnt);
    want_r_displ = cnt2displ(want_r_cnt);
    want_r_data = new HAVEWANT_t[want_r_displ[nrnmpi_numprocs]];
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        HAVEWANT_alltoallv(
            want_s_data, want_s_cnt, want_s_displ, want_r_data, want_r_cnt, want_r_displ);
    } else
#endif
    {
        for (int i = 0; i < want_s_displ[nrnmpi_numprocs]; ++i) {
            want_r_data[i] = want_s_data[i];
        }
    }
    // now the want_r_data on the have_ranks are grouped according to the ranks
    // that want those keys.

    // Hand the six remaining arrays to the caller through the reference
    // parameters.
    send_to_want = want_r_data;
    send_to_want_cnt = want_r_cnt;
    send_to_want_displ = want_r_displ;
    recv_from_have = want_s_data;
    recv_from_have_cnt = want_s_cnt;
    recv_from_have_displ = want_s_displ;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/network/multisend.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/network/multisend.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/network/netcon.hpp"
#include "coreneuron/network/netcvode.hpp"

/*
Overall exchange strategy

When a cell spikes, it immediately does a multisend of
(int gid, double spiketime) to all the target machines that have
cells that need to receive this spike by spiketime + delay.
The MPI implementation does not block due to use of MPI_Isend.

In order to minimize the number of nrnmpi_multisend_conserve tests
(and potentially abandon them altogether if I can ever guarantee
that exchange time is less than half the computation time), I divide the
minimum delay integration intervals into two equal subintervals.
So if a spike is generated in an even subinterval, I do not have
to include it in the conservation check until the end of the next even
subinterval.

When a spike is received (generally MPI_Iprobe, MPI_Recv) it is placed in
even or odd buffers (depending on whether the coded gid is positive or negative)

At the end of a computation subinterval the even or odd buffer spikes
are enqueued in the priority queue after checking that the number
of spikes sent is equal to the number of spikes received.
*/

// The initial idea behind the optional phase2 is to avoid the large overhead of
// initiating a send of the up to 10k list of target hosts when a cell fires.
// I.e. when there are a small number of cells on a processor, this causes
// load balance problems.
// Load balance should be better if the send is distributed to a much smaller
// set of targets, which, when they receive the spike, pass it on to a neighbor
// set. A non-exclusive alternative to this is the use of RECORD_REPLAY
// which give a very fast initiation but we have not been able to get that
// to complete in the sense of all the targets receiving their spikes before
// the conservation step.
// We expect that phase2 will work best in combination with ENQUEUE=2
// which has the greatest amount of overlap between computation
// and communication.
namespace coreneuron {
// When false, nrn_multisend_setup() returns right after cleanup and
// nrn_multisend_advance() does nothing.
bool use_multisend_;
// Passed to nrn_multisend_setup_targets(); when true, enqueue()/enqueue1()
// also queue received spikes for forwarding to phase2 targets.
bool use_phase2_;
// Number of receive buffers initialised by nrn_multisend_init(). The code in
// this file only treats the value 2 specially (two alternating buffers).
int n_multisend_interval = 2;

#if NRN_MULTISEND

// Reset to 0 in nrn_multisend_init(); not otherwise updated in this file.
static int n_xtra_cons_check_;
#define MAXNCONS 10
#if MAXNCONS
// Zeroed in nrn_multisend_init(); not otherwise updated in this file.
static int xtra_cons_hist_[MAXNCONS + 1];
#endif

// ENQUEUE 0 means Multisend_ReceiveBuffer buffer -> InputPreSyn.send
// ENQUEUE 1 means Multisend_ReceiveBuffer buffer -> psbuf -> InputPreSyn.send
// ENQUEUE 2 means Multisend_ReceiveBuffer.incoming -> InputPreSyn.send
// Note that ENQUEUE 2 gives more overlap between computation and exchange
// since the enqueuing takes place during computation except for those
// spikes remaining during conservation.
#define ENQUEUE 2

#if ENQUEUE == 2
// Both counters are only ever reset to 0 in this file (in nrn_multisend_init()
// and nrn_multisend_receive()).
static unsigned long enq2_find_time_;
static unsigned long enq2_enqueue_time_;  // includes enq2_find_time_
#endif

// Capacity of the phase2 ring buffer. It must be a power of 2 because the
// head/tail indices are wrapped with PHASE2BUFFER_MASK.
#define PHASE2BUFFER_SIZE 2048  // power of 2
#define PHASE2BUFFER_MASK (PHASE2BUFFER_SIZE - 1)
// One pending phase2 forward: a received spike that phase2send() will pass on
// via multisend_send_phase2().
struct Phase2Buffer {
    InputPreSyn* ps;   // input presyn found for the spike's gid
    double spiketime;  // spike time as received
    int gid;           // gid as stored in the receive buffer (not complemented)
};

#define MULTISEND_RECEIVEBUFFER_SIZE 10000
// Staging buffer for spikes received through multisend.
// incoming() stores heap-allocated NRNMPI_Spike objects; the enqueue*()
// methods deliver them through InputPreSyn::send and free them. The object
// also holds a ring buffer of spikes waiting to be forwarded to phase2
// targets, plus the send/receive counters used for the conservation check.
class Multisend_ReceiveBuffer {
  public:
    Multisend_ReceiveBuffer();
    virtual ~Multisend_ReceiveBuffer();
    // The object owns raw arrays that the destructor deletes, so a copy would
    // free them twice. Copying is therefore disabled.
    Multisend_ReceiveBuffer(const Multisend_ReceiveBuffer&) = delete;
    Multisend_ReceiveBuffer& operator=(const Multisend_ReceiveBuffer&) = delete;

    void init(int index);                      // reset all state; index selects the buffer slot
    void incoming(int gid, double spiketime);  // append one received spike
    void enqueue();                            // deliver and free all buffered spikes
    int index_{};                              // slot index; nonzero => gids complemented in phase2send
    int size_{MULTISEND_RECEIVEBUFFER_SIZE};   // capacity of buffer_ (doubled on overflow)
    int count_{};                              // number of spikes currently buffered
    int maxcount_{};                           // high-water mark of count_
    bool busy_{};                              // re-entrancy guard checked with nrn_assert
    int nsend_{}, nrecv_{};                    // for checking conservation
    int nsend_cell_{};                         // cells that spiked this interval.
    NRNMPI_Spike** buffer_{};                  // owned array of owned spike pointers

    void enqueue1();          // ENQUEUE == 1: resolve gid -> InputPreSyn into psbuf_
    void enqueue2();          // ENQUEUE == 1: deliver using psbuf_ and free the spikes
    InputPreSyn** psbuf_{};   // only allocated when ENQUEUE == 1, otherwise null

    void phase2send();        // drain the phase2 ring buffer
    int phase2_head_{};       // next write position in phase2_buffer_
    int phase2_tail_{};       // next read position in phase2_buffer_
    int phase2_nsend_cell_{}, phase2_nsend_{};
    Phase2Buffer* phase2_buffer_{};  // owned ring buffer of PHASE2BUFFER_SIZE entries
};

// Compile-time number of receive-buffer slots.
#define MULTISEND_INTERVAL 2
// Receive buffers, created lazily in nrn_multisend_setup().
static Multisend_ReceiveBuffer* multisend_receive_buffer[MULTISEND_INTERVAL];
// current_rbuf: buffer drained by nrn_multisend_receive()/nrn_multisend_advance().
// next_rbuf: buffer that newly sent spikes are accounted to in nrn_multisend_send().
static int current_rbuf, next_rbuf;
#if MULTISEND_INTERVAL == 2
// note that if a spike is supposed to be received by multisend_receive_buffer[1]
// then during transmission its gid is complemented.
#endif

// Target-rank tables filled by nrn_multisend_setup_targets() and freed in
// nrn_multisend_cleanup().
// phase1 entry layout: cnt, cnt_phase1, then target ranks.
// phase2 entry layout: cnt_phase2, then target ranks.
static int* targets_phase1_;
static int* targets_phase2_;

// Send the spike of presyn `ps` at time `t` to its phase1 target ranks.
// Does nothing when ps->multisend_index_ is negative. Only the first thread
// (nt == nrn_threads) may send; any other thread hits assert(0).
void nrn_multisend_send(PreSyn* ps, double t, NrnThread* nt) {
    const int offset = ps->multisend_index_;
    if (offset < 0) {
        return;
    }

    // format is cnt, cnt_phase1, array of target ranks.
    // Valid for one or two phase.
    int* entry = targets_phase1_ + offset;
    const int n_targets = entry[0];
    const int n_phase1 = entry[1];
    int* ranks = entry + 2;

    NRNMPI_Spike spk;
    spk.gid = ps->output_index_;
    spk.spiketime = t;
    // A complemented gid tells the receiver to use receive buffer 1.
    if (next_rbuf == 1) {
        spk.gid = ~spk.gid;
    }

    if (nt != nrn_threads) {
        assert(0);
        return;
    }
    Multisend_ReceiveBuffer* rbuf = multisend_receive_buffer[next_rbuf];
    rbuf->nsend_ += n_targets;
    rbuf->nsend_cell_ += 1;
    nrnmpi_multisend(&spk, n_phase1, ranks);
}

// Forward a received spike to the phase2 target ranks of input presyn `ps`.
// `gid` is sent as given; the caller has already complemented it if needed.
static void multisend_send_phase2(InputPreSyn* ps, int gid, double t) {
    const int offset = ps->multisend_phase2_index_;
    assert(offset >= 0);

    // format is cnt_phase2, array of target ranks
    int* entry = targets_phase2_ + offset;
    const int n_phase2 = entry[0];
    int* ranks = entry + 1;

    NRNMPI_Spike spk;
    spk.gid = gid;
    spk.spiketime = t;
    nrnmpi_multisend(&spk, n_phase2, ranks);
}

// Allocate the spike pointer array (size_ entries), the phase2 ring buffer
// and, only when ENQUEUE == 1, the parallel InputPreSyn pointer array.
Multisend_ReceiveBuffer::Multisend_ReceiveBuffer()
    : buffer_(new NRNMPI_Spike*[size_])
#if ENQUEUE == 1
    , psbuf_(new InputPreSyn*[size_])
#endif
    , phase2_buffer_(new Phase2Buffer[PHASE2BUFFER_SIZE]) {
}

// Free any spikes still buffered, then the arrays themselves.
Multisend_ReceiveBuffer::~Multisend_ReceiveBuffer() {
    nrn_assert(!busy_);
    for (int k = 0; k < count_; ++k) {
        NRNMPI_Spike* pending = buffer_[k];
        delete pending;
    }
    delete[] buffer_;
    // delete[] on a null pointer is a no-op, so psbuf_ needs no check.
    delete[] psbuf_;
    delete[] phase2_buffer_;
}
void Multisend_ReceiveBuffer::init(int index) {
    index_ = index;
    nsend_cell_ = nsend_ = nrecv_ = maxcount_ = 0;
    busy_ = false;
    for (int i = 0; i < count_; ++i) {
        delete buffer_[i];
    }
    count_ = 0;

    phase2_head_ = phase2_tail_ = 0;
    phase2_nsend_cell_ = phase2_nsend_ = 0;
}
void Multisend_ReceiveBuffer::incoming(int gid, double spiketime) {
    // printf("%d %p.incoming %g %g %d\n", nrnmpi_myid, this, t, spk->spiketime, spk->gid);
    nrn_assert(!busy_);
    busy_ = true;

    if (count_ >= size_) {
        size_ *= 2;
        NRNMPI_Spike** newbuf = new NRNMPI_Spike*[size_];
        for (int i = 0; i < count_; ++i) {
            newbuf[i] = buffer_[i];
        }
        delete[] buffer_;
        buffer_ = newbuf;
        if (psbuf_) {
            delete[] psbuf_;
            psbuf_ = new InputPreSyn*[size_];
        }
    }
    NRNMPI_Spike* spk = new NRNMPI_Spike();
    spk->gid = gid;
    spk->spiketime = spiketime;
    buffer_[count_++] = spk;
    if (maxcount_ < count_) {
        maxcount_ = count_;
    }

    ++nrecv_;
    busy_ = false;
}
void Multisend_ReceiveBuffer::enqueue() {
    // printf("%d %p.enqueue count=%d t=%g nrecv=%d nsend=%d\n", nrnmpi_myid, this, t, count_,
    // nrecv_, nsend_);
    nrn_assert(!busy_);
    busy_ = true;

    for (int i = 0; i < count_; ++i) {
        NRNMPI_Spike* spk = buffer_[i];

        auto gid2in_it = gid2in.find(spk->gid);
        assert(gid2in_it != gid2in.end());
        InputPreSyn* ps = gid2in_it->second;

        if (use_phase2_ && ps->multisend_phase2_index_ >= 0) {
            Phase2Buffer& pb = phase2_buffer_[phase2_head_++];
            phase2_head_ &= PHASE2BUFFER_MASK;
            assert(phase2_head_ != phase2_tail_);
            pb.ps = ps;
            pb.spiketime = spk->spiketime;
            pb.gid = spk->gid;
        }

        ps->send(spk->spiketime, net_cvode_instance, nrn_threads);
        delete spk;
    }

    count_ = 0;
#if ENQUEUE != 2
    nrecv_ = 0;
    nsend_ = 0;
    nsend_cell_ = 0;
#endif
    busy_ = false;
    phase2send();
}

void Multisend_ReceiveBuffer::enqueue1() {
    // printf("%d %lx.enqueue count=%d t=%g nrecv=%d nsend=%d\n", nrnmpi_myid, (long)this, t,
    // count_, nrecv_, nsend_);
    nrn_assert(!busy_);
    busy_ = true;
    for (int i = 0; i < count_; ++i) {
        NRNMPI_Spike* spk = buffer_[i];

        auto gid2in_it = gid2in.find(spk->gid);
        assert(gid2in_it != gid2in.end());
        InputPreSyn* ps = gid2in_it->second;
        psbuf_[i] = ps;
        if (use_phase2_ && ps->multisend_phase2_index_ >= 0) {
            Phase2Buffer& pb = phase2_buffer_[phase2_head_++];
            phase2_head_ &= PHASE2BUFFER_MASK;
            assert(phase2_head_ != phase2_tail_);
            pb.ps = ps;
            pb.spiketime = spk->spiketime;
            pb.gid = spk->gid;
        }
    }
    busy_ = false;
    phase2send();
}

// Second half of the ENQUEUE == 1 path: deliver every buffered spike through
// the InputPreSyn that enqueue1() stored in psbuf_, free the spike, then reset
// the count and the conservation counters.
void Multisend_ReceiveBuffer::enqueue2() {
    // printf("%d %lx.enqueue count=%d t=%g nrecv=%d nsend=%d\n", nrnmpi_myid, (long)this, t,
    // count_, nrecv_, nsend_);
    nrn_assert(!busy_);
    // Mark the buffer busy while it is being drained, as the sibling
    // enqueue()/enqueue1()/incoming() methods do.
    busy_ = true;
    for (int i = 0; i < count_; ++i) {
        NRNMPI_Spike* spk = buffer_[i];
        InputPreSyn* ps = psbuf_[i];
        ps->send(spk->spiketime, net_cvode_instance, nrn_threads);
        delete spk;
    }
    count_ = 0;
    nrecv_ = 0;
    nsend_ = 0;
    nsend_cell_ = 0;
    busy_ = false;
}

void Multisend_ReceiveBuffer::phase2send() {
    while (phase2_head_ != phase2_tail_) {
        Phase2Buffer& pb = phase2_buffer_[phase2_tail_++];
        phase2_tail_ &= PHASE2BUFFER_MASK;
        int gid = pb.gid;
        if (index_) {
            gid = ~gid;
        }
        multisend_send_phase2(pb.ps, gid, pb.spiketime);
    }
}

// Reset to 0 in nrn_multisend_setup(); not otherwise written in this file.
static int max_ntarget_host;
// For one phase sending, max_multisend_targets is max_ntarget_host.
// For two phase sending, it is the maximum of all the
// ntarget_hosts_phase1 and ntarget_hosts_phase2.
// In this file it is only reset to 0 in nrn_multisend_setup().
static int max_multisend_targets;

// Reset the receive buffers, the buffer selectors and the bookkeeping
// counters before a run. Expects the first n_multisend_interval receive
// buffers to exist already.
void nrn_multisend_init() {
    int slot = 0;
    while (slot < n_multisend_interval) {
        multisend_receive_buffer[slot]->init(slot);
        ++slot;
    }
    current_rbuf = 0;
    next_rbuf = n_multisend_interval - 1;
#if ENQUEUE == 2
    enq2_enqueue_time_ = 0;
    enq2_find_time_ = 0;
#endif
    n_xtra_cons_check_ = 0;
#if MAXNCONS
    for (int& bin: xtra_cons_hist_) {
        bin = 0;
    }
#endif  // MAXNCONS
}

// Pull every spike currently available from nrnmpi_multisend_single_advance()
// into the matching receive buffer. Returns the number of spikes pulled.
static int multisend_advance() {
    int n_received = 0;
    NRNMPI_Spike spk;
    for (; nrnmpi_multisend_single_advance(&spk); ++n_received) {
        int which = 0;
#if MULTISEND_INTERVAL == 2
        // A negative gid is a complemented gid destined for buffer 1.
        if (spk.gid < 0) {
            which = 1;
            spk.gid = ~spk.gid;
        }
#endif
        multisend_receive_buffer[which]->incoming(spk.gid, spk.spiketime);
    }
    return n_received;
}

#if NRN_MULTISEND
// Poll for incoming spikes and, with ENQUEUE == 2, immediately enqueue what
// has arrived in the current receive buffer. No-op unless use_multisend_ is set.
void nrn_multisend_advance() {
    if (!use_multisend_) {
        return;
    }
    multisend_advance();
#if ENQUEUE == 2
    multisend_receive_buffer[current_rbuf]->enqueue();
#endif
}
#endif

// End-of-subinterval receive step; must be called with the first thread
// (asserted). It polls until the send/receive counts of the current buffer
// are conserved, delivers the buffered spikes according to ENQUEUE, resets
// the counters and, with two intervals, swaps the current and next buffers.
void nrn_multisend_receive(NrnThread* nt) {
    // nrn_spike_exchange();
    assert(nt == nrn_threads);
    // double w1, w2;
    // Number of extra conservation rounds; counted but not otherwise used here.
    int ncons = 0;
    // References, so the conservation check sees counter updates made while
    // advancing, and so both can be reset through s and r below.
    int& s = multisend_receive_buffer[current_rbuf]->nsend_;
    int& r = multisend_receive_buffer[current_rbuf]->nrecv_;
// w1 = nrn_wtime();
#if NRN_MULTISEND & 1
    if (use_multisend_) {
        nrn_multisend_advance();
        nrnmpi_barrier();
        nrn_multisend_advance();
        // with two phase we expect conservation to hold and ncons should
        // be 0.
        while (nrnmpi_multisend_conserve(s, r) != 0) {
            nrn_multisend_advance();
            ++ncons;
        }
    }
#endif
    // w1 = nrn_wtime() - w1;
    // w2 = nrn_wtime();

#if ENQUEUE == 0
    multisend_receive_buffer[current_rbuf]->enqueue();
#endif
#if ENQUEUE == 1
    multisend_receive_buffer[current_rbuf]->enqueue1();
    multisend_receive_buffer[current_rbuf]->enqueue2();
#endif
#if ENQUEUE == 2
    multisend_receive_buffer[current_rbuf]->enqueue();
    // enqueue() leaves the counters alone when ENQUEUE == 2, so reset them here.
    s = r = multisend_receive_buffer[current_rbuf]->nsend_cell_ = 0;

    multisend_receive_buffer[current_rbuf]->phase2_nsend_cell_ = 0;
    multisend_receive_buffer[current_rbuf]->phase2_nsend_ = 0;

    enq2_find_time_ = 0;
    enq2_enqueue_time_ = 0;
#endif  // ENQUEUE == 2
// wt1_ = nrn_wtime() - w2;
// wt_ = w1;
#if MULTISEND_INTERVAL == 2
    // printf("%d reverse buffers %g\n", nrnmpi_myid, t);
    if (n_multisend_interval == 2) {
        // Swap roles: the buffer that was being filled becomes the one to drain.
        current_rbuf = next_rbuf;
        next_rbuf = ((next_rbuf + 1) & 1);
    }
#endif
}

void nrn_multisend_cleanup() {
    if (targets_phase1_) {
        delete[] targets_phase1_;
        targets_phase1_ = nullptr;
    }

    if (targets_phase2_) {
        delete[] targets_phase2_;
        targets_phase2_ = nullptr;
    }

    // cleanup MultisendReceiveBuffer here as well
}

// (Re)build the multisend state: clean up what an earlier setup left behind
// and, when use_multisend_ is set, create the communicator, compute the
// target tables and make sure the receive buffers exist.
void nrn_multisend_setup() {
    nrn_multisend_cleanup();
    if (use_multisend_ == false) {
        return;
    }
    nrnmpi_multisend_comm();
    // if (nrnmpi_myid == 0) printf("multisend_setup()\n");
    // although we only care about the set of hosts that gid2out_
    // sends spikes to (source centric). We do not want to send
    // the entire list of gid2in (which may be 10000 times larger
    // than gid2out) from every machine to every machine.
    // so we accomplish the task in two phases the first of which
    // involves allgather with a total receive buffer size of number
    // of cells (even that is too large and we will split it up
    // into chunks). And the second, an
    // allreduce with receive buffer size of number of hosts.
    max_multisend_targets = 0;
    max_ntarget_host = 0;

    // completely new algorithm does one and two phase.
    nrn_multisend_setup_targets(use_phase2_, targets_phase1_, targets_phase2_);

    // Receive buffers are created on first use only.
    if (multisend_receive_buffer[0] == nullptr) {
        multisend_receive_buffer[0] = new Multisend_ReceiveBuffer();
    }
#if MULTISEND_INTERVAL == 2
    const bool need_second = n_multisend_interval == 2 && multisend_receive_buffer[1] == nullptr;
    if (need_second) {
        multisend_receive_buffer[1] = new Multisend_ReceiveBuffer();
    }
#endif
}
#endif  // NRN_MULTISEND
}  // namespace coreneuron


================================================
FILE: coreneuron/network/multisend.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/mpi/nrnmpiuse.h"
namespace coreneuron {
// Runtime switches and the interval count, defined in multisend.cpp.
extern bool use_multisend_;
extern int n_multisend_interval;
extern bool use_phase2_;

class PreSyn;
struct NrnThread;

// Send the spike of a PreSyn at time t to its phase1 target ranks.
void nrn_multisend_send(PreSyn*, double t, NrnThread*);
// End-of-subinterval receive and conservation check.
void nrn_multisend_receive(NrnThread*);  // must be thread 0
// Poll for incoming spikes; does nothing unless use_multisend_ is set.
void nrn_multisend_advance();
// Reset the receive buffers and counters.
void nrn_multisend_init();

// Free the target tables.
void nrn_multisend_cleanup();
// Clean up, then build the target tables and receive buffers when
// use_multisend_ is set.
void nrn_multisend_setup();

// Fill the phase1 (and, if requested, phase2) target tables; both pointers are
// output parameters.
void nrn_multisend_setup_targets(bool use_phase2, int*& targets_phase1, int*& targets_phase2);
}  // namespace coreneuron


================================================
FILE: coreneuron/network/multisend_setup.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstdio>
#include <cmath>
#include <numeric>

#if CORENRN_DEBUG
#include <fstream>
#include <iomanip>
#endif

#include "coreneuron/utils/randoms/nrnran123.h"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/network/multisend.hpp"
#include "coreneuron/mpi/nrnmpidec.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/utils/memory_utils.h"
#include "coreneuron/utils/utils.hpp"
/*
For very large numbers of processors and cells and fanout, it is taking
a long time to figure out each cell's target list given the input gids
(gid2in) on each host, e.g. 240 seconds for 2^25 cells, 1k connections
per cell, and 128K cores; and 340 seconds for two phase exchange.
To reduce this setup time we experiment with a very different algorithm in which
we construct a gid target host list on host gid%nhost and copy that list to
the source host owning the gid.
*/

#if NRN_MULTISEND
namespace coreneuron {
// gid -> InputPreSyn* map type.
using Gid2IPS = std::map<int, InputPreSyn*>;
// gid -> PreSyn* map type.
using Gid2PS = std::map<int, PreSyn*>;

#if CORENRN_DEBUG
// Debug helper: append the label `p`, this rank and the keys of `map` to the
// per-rank file "debug.<rank>".
template <typename T>
static void celldebug(const char* p, T& map) {
    const int rank = nrnmpi_myid;
    std::ofstream f(std::string("debug.") + std::to_string(nrnmpi_myid), std::ios::app);
    f << std::endl << p << std::endl;
    f << "  " << std::setw(2) << std::setfill('0') << rank << ":";
    for (const auto& entry: map) {
        const int gid = entry.first;
        f << "  " << std::setw(2) << std::setfill('0') << gid << ":";
    }
    f << std::endl;
}

// Debug helper: append the send and receive sides of an alltoallv exchange
// (per-rank count, displacement and payload) to the file "debug.<rank>".
static void alltoalldebug(const char* p,
                          const std::vector<int>& s,
                          const std::vector<int>& scnt,
                          const std::vector<int>& sdispl,
                          const std::vector<int>& r,
                          const std::vector<int>& rcnt,
                          const std::vector<int>& rdispl) {
    std::ofstream f(std::string("debug.") + std::to_string(nrnmpi_myid), std::ios::app);
    f << std::endl << p << std::endl;
    const int rank = nrnmpi_myid;
    f << "  rank " << rank << std::endl;

    // One output line per rank: "<prefix><rank> : <cnt> <displ> :" followed
    // by that rank's slice of the payload.
    const auto dump_side = [&f](const char* prefix,
                                const std::vector<int>& payload,
                                const std::vector<int>& cnt,
                                const std::vector<int>& displ) {
        for (int i = 0; i < nrnmpi_numprocs; ++i) {
            f << prefix << i << " : " << cnt[i] << " " << displ[i] << " :";
            for (int j = displ[i]; j < displ[i + 1]; ++j) {
                f << "  " << std::setw(2) << std::setfill('0') << payload[j] << ":";
            }
            f << std::endl;
        }
    };
    dump_side("    s", s, scnt, sdispl);
    dump_side("    r", r, rcnt, rdispl);
}
#else
// No-op replacements used when CORENRN_DEBUG is off, so call sites need no
// conditional compilation.
template <typename T>
static void celldebug(const char*, T&) {}
static void alltoalldebug(const char*,
                          const std::vector<int>&,
                          const std::vector<int>&,
                          const std::vector<int>&,
                          const std::vector<int>&,
                          const std::vector<int>&,
                          const std::vector<int>&) {}
#endif

#if CORENRN_DEBUG
// Debug helper: for every entry of gid2out, append its gid and its phase1
// target ranks to the file "debug.<rank>".
void phase1debug(int* targets_phase1) {
    std::ofstream f(std::string("debug.") + std::to_string(nrnmpi_myid), std::ios::app);
    f << std::endl << "phase1debug " << nrnmpi_myid;
    for (auto& item: gid2out) {
        PreSyn* ps = item.second;
        f << std::endl << " " << std::setw(2) << std::setfill('0') << ps->gid_ << ":";
        // Entry layout: cnt, cnt_phase1, then the target ranks.
        int* entry = targets_phase1 + ps->multisend_index_;
        const int n_phase1 = entry[1];
        int* ranks = entry + 2;
        for (int k = 0; k < n_phase1; ++k) {
            f << " " << std::setw(2) << std::setfill('0') << ranks[k];
        }
    }
    f << std::endl;
}

// Debug helper: for every entry of gid2in, append its gid and, when it has a
// phase2 entry, its phase2 target ranks to the file "debug.<rank>".
void phase2debug(int* targets_phase2) {
    std::ofstream f(std::string("debug.") + std::to_string(nrnmpi_myid), std::ios::app);
    f << std::endl << "phase2debug " << nrnmpi_myid;
    for (auto& item: gid2in) {
        const int gid = item.first;
        InputPreSyn* ps = item.second;
        f << std::endl << " " << std::setw(2) << std::setfill('0') << gid << ":";
        const int offset = ps->multisend_phase2_index_;
        if (offset < 0) {
            continue;  // no phase2 targets for this gid
        }
        // Entry layout: cnt_phase2, then the target ranks.
        int* entry = targets_phase2 + offset;
        const int n_phase2 = entry[0];
        int* ranks = entry + 1;
        for (int k = 0; k < n_phase2; ++k) {
            f << " " << std::setw(2) << std::setfill('0') << ranks[k];
        }
    }
    f << std::endl;
}
#endif

// Return the offsets (exclusive prefix sums) for a list of counts: the result
// has acnt.size() + 1 entries, starts with 0 and ends with the total.
static std::vector<int> newoffset(const std::vector<int>& acnt) {
    std::vector<int> offsets;
    offsets.reserve(acnt.size() + 1);
    int running = 0;
    offsets.push_back(running);
    for (const int count: acnt) {
        running += count;
        offsets.push_back(running);
    }
    return offsets;
}

// input: scnt; output: rcnt, rdispl
// Exchange the per-rank send counts so every rank learns how much it will
// receive from each other rank. Returns {rcnt, rdispl}, where rdispl holds the
// offsets derived from rcnt.
static std::pair<std::vector<int>, std::vector<int>> all2allv_helper(const std::vector<int>& scnt) {
    const int nhost = nrnmpi_numprocs;
    // One int per rank, placed at displacement == rank, for both directions.
    std::vector<int> ones(nhost, 1);
    std::vector<int> unit_displ = newoffset(ones);
    std::vector<int> rcnt(nhost, 0);
    nrnmpi_int_alltoallv(scnt.data(),
                         ones.data(),
                         unit_displ.data(),
                         rcnt.data(),
                         ones.data(),
                         unit_displ.data());
    std::vector<int> rdispl = newoffset(rcnt);
    return std::make_pair(std::move(rcnt), std::move(rdispl));
}

/*
Define the following to 1 to print space/performance information such as:
all2allv_int gidin to intermediate space=1552 total=37345104 time=0.000495835
all2allv_int gidout space=528 total=37379376 time=1.641e-05
all2allv_int lists space=3088 total=37351312 time=4.4708e-05
*/

#define all2allv_perf 0

// input: s, scnt, sdispl; output: r, rdispl
// Variable-size all-to-all exchange of ints. The receive counts are obtained
// first via all2allv_helper(), then the payload is exchanged.
// Returns {r, rdispl}: the received data and its per-rank offsets.
// `dmes` labels the debug and performance output.
static std::pair<std::vector<int>, std::vector<int>> all2allv_int(const std::vector<int>& s,
                                                                  const std::vector<int>& scnt,
                                                                  const std::vector<int>& sdispl,
                                                                  const char* dmes) {
#if all2allv_perf
    double tm = nrn_wtime();
#endif
    std::pair<std::vector<int>, std::vector<int>> counts = all2allv_helper(scnt);
    std::vector<int>& rcnt = counts.first;
    std::vector<int>& rdispl = counts.second;

    // The last offset is the total number of ints this rank receives.
    std::vector<int> r(rdispl[nrnmpi_numprocs], 0);
    nrnmpi_int_alltoallv(
        s.data(), scnt.data(), sdispl.data(), r.data(), rcnt.data(), rdispl.data());
    alltoalldebug(dmes, s, scnt, sdispl, r, rcnt, rdispl);

#if all2allv_perf
    if (nrnmpi_myid == 0) {
        int nb = 4 * nrnmpi_numprocs + sdispl[nrnmpi_numprocs] + rdispl[nrnmpi_numprocs];
        tm = nrn_wtime() - tm;
        printf("all2allv_int %s space=%d total=%g time=%g\n", dmes, nb, nrn_mallinfo(), tm);
    }
#endif
    return std::make_pair(std::move(r), std::move(rdispl));
}

// List of target ranks for one gid, built on the intermediate rank.
// The object owns the `list` and `indices` arrays and releases them with
// delete[] in its destructor.
class TarList {
  public:
    TarList();
    virtual ~TarList();
    // The arrays are owned through raw pointers, so a copy would make two
    // objects delete[] the same memory. Copying is therefore disabled.
    TarList(const TarList&) = delete;
    TarList& operator=(const TarList&) = delete;
    // Allocate `list` with room for `size` ints (no-op when size is 0).
    virtual void alloc();
    int size;   // number of entries in list (or number of groups, see indices)
    int* list;  // owned array of target ranks
    int rank;   // -1 until a rank is assigned

    int* indices;  // indices of list for groups of phase2 targets.
                   // If indices is not null, then size is one less than
                   // the size of the indices list where indices[size] = the size of
                   // the list. Indices[0] is 0 and list[indices[i]] is the rank
                   // to send the ith group of phase2 targets.
};

using Int2TarList = std::map<int, TarList*>;

TarList::TarList()
    : size(0)
    , list(nullptr)
    , rank(-1)
    , indices(nullptr) {}

TarList::~TarList() {
    delete[] list;
    delete[] indices;
}

void TarList::alloc() {
    if (size) {
        // Allocate first so that `list` stays valid if new throws, then
        // release any previous array instead of leaking it.
        int* fresh = new int[size];
        delete[] list;
        list = fresh;
    }
}

// for two phase

// Random stream used by get_random(); null while no stream exists.
static nrnran123_State* ranstate = nullptr;

// Create the random stream with id (i, 0) unless one already exists.
static void random_init(int i) {
    if (ranstate) {
        return;
    }
    ranstate = nrnran123_newstream(i, 0);
}

// Draw the next integer from the stream created by random_init().
static unsigned int get_random() {
    const unsigned int draw = nrnran123_ipick(ranstate);
    return draw;
}

// Destroy the random stream (if any) and reset the handle.
// Avoids warnings if the global index is changed on subsequent psolve.
static void random_delete() {
    if (!ranstate) {
        return;
    }
    nrnran123_deletestream(ranstate);
    ranstate = nullptr;
}

// Discrete uniform random integer from i1 to i2 inclusive.
// Must work if i1 == i2, in which case no random number is drawn.
static int iran(int i1, int i2) {
    if (i1 == i2) {
        return i1;
    }
    const int span = i2 - i1 + 1;
    int picked = i1 + get_random() % span;
    return picked;
}

// Split a target list into about sqrt(size) groups for two-phase exchange.
// On return (when more than one group is made) tl->size is the number of
// groups, tl->indices[g] is the start of group g in tl->list,
// tl->indices[tl->size] is the original list length, and the first entry of
// each group is a randomly chosen member of that group (the phase2 sender).
static void phase2organize(TarList* tl) {
    const int total = tl->size;
    const int ngroup = int(sqrt(double(total)));
    // change to about 20
    if (ngroup <= 1) {  // do not bother if not many connections
        return;
    }

    // equal as possible group sizes
    int* bounds = new int[ngroup + 1];
    for (int g = 0; g < ngroup; ++g) {
        bounds[g] = (g * total) / ngroup;
    }
    bounds[ngroup] = total;
    tl->indices = bounds;
    tl->size = ngroup;

    // Note: not sure the following is true anymore but it could be.
    // Always using the first member of a group as the phase2 sender is very
    // biased (if 0 is a phase1 target it is always a phase2 sender). So
    // choose a random member of each group and swap it to the front.
    for (int g = 0; g < ngroup; ++g) {
        const int first = bounds[g];
        const int last = bounds[g + 1] - 1;
        // discrete uniform random integer from first to last
        const int chosen = iran(first, last);
        const int front_rank = tl->list[first];
        tl->list[first] = tl->list[chosen];
        tl->list[chosen] = front_rank;
    }
}

// end of twophase

/*
Setting up target lists uses a lot of temporary memory. It is conceivable
that this can be done prior to creating any cells or connections. I.e.
gid2out is presently known from pc.set_gid2node(gid,...). Gid2in is presently
known from NetCon = pc.gid_connect(gid, target) and it is quite a style
and hoc network programming change to use something like pc.need_gid(gid)
before cells with their synapses are created since one would have to imagine
that the hoc network setup code would have to be executed in a virtual
or 'abstract' fashion without actually creating cells, targets, or NetCons.
Anyway, to potentially support this in the future, we write setup_target_lists
to not use any PreSyn information.
*/

static std::vector<int> setup_target_lists(bool);
static void fill_multisend_lists(bool, const std::vector<int>&, int*&, int*&);

// Build the multisend target arrays for this rank.
// use_phase2: whether two-phase exchange lists are wanted.
// targets_phase1 / targets_phase2: set to newly allocated int arrays by
// fill_multisend_lists.
void nrn_multisend_setup_targets(bool use_phase2, int*& targets_phase1, int*& targets_phase2) {
    const std::vector<int> encoded_lists = setup_target_lists(use_phase2);

    // Mark every output PreSyn as unused until a target list is found for it.
    for (auto& out_entry: gid2out) {
        out_entry.second->multisend_index_ = -1;
    }

    // Stays -1 unless the input gid turns out to be a phase 2 sender.
    for (auto& in_entry: gid2in) {
        in_entry.second->multisend_phase2_index_ = -1;
    }

    fill_multisend_lists(use_phase2, encoded_lists, targets_phase1, targets_phase2);

    // phase1debug(targets_phase1);
    // phase2debug(targets_phase2);
}

// Some notes about threads and the rank lists.
// Assume all MPI message sent and received from a single thread (0).
// gid2in and gid2out are rank wide lists for all threads
//
// Decode the flat vector produced by setup_target_lists into the two target
// arrays used for sending.
//
// r is a sequence of records: gid, size, [totalsize], list. totalsize is
// present only for output gids when use_phase2 is true.
//
// Output layout:
//   targets_phase1[PreSyn::multisend_index_ ...]        = total, size, ranks...
//   targets_phase2[InputPreSyn::multisend_phase2_index_ ...] = size, ranks...
// Both arrays are allocated here with new[].
static void fill_multisend_lists(bool use_phase2,
                                 const std::vector<int>& r,
                                 int*& targets_phase1,
                                 int*& targets_phase2) {
    int phase1_index = 0;
    int phase2_index = 0;

    // Pass 1: assign multisend_index_ / multisend_phase2_index_ and count how
    // many ints each target array needs.
    for (std::size_t i = 0; i < r.size();) {
        InputPreSyn* ips = nullptr;
        int gid = r[i++];
        int size = r[i++];
        if (use_phase2) {  // look in gid2in first
            auto gid2in_it = gid2in.find(gid);
            if (gid2in_it != gid2in.end()) {  // phase 2 target list
                ips = gid2in_it->second;
                ips->multisend_phase2_index_ = phase2_index;
                phase2_index += 1 + size;  // count + ranks
                i += size;
            }
        }
        if (!ips) {  // phase 1 target list (or whole list if use_phase2 is 0)
            auto gid2out_it = gid2out.find(gid);
            assert(gid2out_it != gid2out.end());
            PreSyn* ps = gid2out_it->second;
            ps->multisend_index_ = phase1_index;
            phase1_index += 2 + size;  // total + count + ranks
            if (use_phase2) {
                i++;  // skip the totalsize header element
            }
            i += size;
        }
    }

    targets_phase1 = new int[phase1_index];
    targets_phase2 = new int[phase2_index];

    // Pass 2: walk r again and copy each record to the location assigned above.
    for (std::size_t i = 0; i < r.size();) {
        InputPreSyn* ips = nullptr;
        int gid = r[i++];
        int size = r[i++];
        if (use_phase2) {  // look in gid2in first
            auto gid2in_it = gid2in.find(gid);
            if (gid2in_it != gid2in.end()) {  // phase 2 target list
                ips = gid2in_it->second;
                int p = ips->multisend_phase2_index_;
                int* ranks = targets_phase2 + p;
                ranks[0] = size;
                ranks += 1;
                for (int j = 0; j < size; ++j) {
                    ranks[j] = r[i++];
                    assert(ranks[j] != nrnmpi_myid);
                }
            }
        }
        if (!ips) {  // phase 1 target list (or whole list if use_phase2 is 0)
            auto gid2out_it = gid2out.find(gid);
            assert(gid2out_it != gid2out.end());
            PreSyn* ps = gid2out_it->second;
            int p = ps->multisend_index_;
            int* ranks = targets_phase1 + p;
            int total = size;
            if (use_phase2) {
                total = r[i++];
            }
            ranks[0] = total;
            ranks[1] = size;
            ranks += 2;
            for (int j = 0; j < size; ++j) {
                ranks[j] = r[i++];
                // There never was a possibility of send2self
                // because an output presyn is never in gid2in_.
                assert(ranks[j] != nrnmpi_myid);
            }
        }
    }
}

// Return the vector encoding a sequence of gid, target list size, and target list
static std::vector<int> setup_target_lists(bool use_phase2) {
    int nhost = nrnmpi_numprocs;

    // Construct hash table for finding the target rank list for a given gid.
    Int2TarList gid2tarlist;

    celldebug<Gid2PS>("output gid", gid2out);
    celldebug<Gid2IPS>("input gid", gid2in);

    // What are the target ranks for a given input gid. All the ranks
    // with the same input gid send that gid to the intermediate
    // gid%nhost rank. The intermediate rank can then construct the
    // list of target ranks for the gids it gets.

    {
        // scnt1 is number of input gids from target
        std::vector<int> scnt1(nhost, 0);
        for (const auto& g: gid2in) {
            int gid = g.first;
            ++scnt1[gid % nhost];
        }

        // s1 are the input gids from target to be sent to the various intermediates
        const std::vector<int> sdispl1 = newoffset(scnt1);
        // Make an usable copy
        auto sdispl1_ = sdispl1;
        std::vector<int> s1(sdispl1[nhost], 0);
        for (const auto& g: gid2in) {
            int gid = g.first;
            s1[sdispl1_[gid % nhost]++] = gid;
        }

        std::vector<int> r1;
        std::vector<int> rdispl1;
        std::tie(r1, rdispl1) = all2allv_int(s1, scnt1, sdispl1, "gidin to intermediate");
        // r1 is the gids received by this intermediate rank from all other ranks.

        // Now figure out the size of the target list for each distinct gid in r1.
        for (const auto& gid: r1) {
            if (gid2tarlist.find(gid) == gid2tarlist.end()) {
                gid2tarlist[gid] = new TarList{};
                gid2tarlist[gid]->size = 0;
            }
            auto tar = gid2tarlist[gid];
            ++(tar->size);
        }

        // Conceptually, now the intermediate is the mpi source and the gid
        // sources are the mpi destination in regard to target lists.
        // It would be possible at this point, but confusing,
        // to allocate a s[rdispl1[nhost]] and figure out scnt and sdispl by
        // by getting the counts and gids from the ranks that own the source
        // gids. In this way we could organize s without having to allocate
        // individual target lists on the intermediate and then allocate
        // another large s buffer to receive a copy of them. However for
        // this processing we already require two large buffers for input
        // gid's so there is no real savings of space.
        // So let's do the simple obvious sequence and now complete the
        // target lists.

        // Allocate the target lists (and set size to 0 (we will recount when filling).
        for (const auto& g: gid2tarlist) {
            TarList* tl = g.second;
            tl->alloc();
            tl->size = 0;
        }

        // fill the target lists
        for (int rank = 0; rank < nhost; ++rank) {
            int b = rdispl1[rank];
            int e = rdispl1[rank + 1];
            for (int i = b; i < e; ++i) {
                const auto itl_it = gid2tarlist.find(r1[i]);
                if (itl_it != gid2tarlist.end()) {
                    TarList* tl = itl_it->second;
                    tl->list[tl->size] = rank;
                    tl->size++;
                }
            }
        }
    }

    {
        // Now the intermediate hosts have complete target lists and
        // the sources know the intermediate host from the gid2out_ map.
        // We could potentially organize here for two-phase exchange as well.

        // Which target lists are desired by the source rank?

        // Ironically, for round robin distributions, the target lists are
        // already on the proper source rank so the following code should
        // be tested for random distributions of gids.
        // How many on the source rank?
        std::vector<int> scnt2(nhost, 0);
        for (auto& g: gid2out) {
            int gid = g.first;
            PreSyn* ps = g.second;
            if (ps->output_index_ >= 0) {  // only ones that generate spikes
                ++scnt2[gid % nhost];
            }
        }
        const auto sdispl2 = newoffset(scnt2);
        auto sdispl2_ = sdispl2;

        // what are the gids of those target lists
        std::vector<int> s2(sdispl2[nhost], 0);
        for (auto& g: gid2out) {
            int gid = g.first;
            PreSyn* ps = g.second;
            if (ps->output_index_ >= 0) {  // only ones that generate spikes
                s2[sdispl2_[gid % nhost]++] = gid;
            }
        }
        std::vector<int> r2;
        std::vector<int> rdispl2;
        std::tie(r2, rdispl2) = all2allv_int(s2, scnt2, sdispl2, "gidout");

        // fill in the tl->rank for phase 1 target lists
        // r2 is an array of source spiking gids
        // tl is list associating input gids with list of target ranks.
        for (int rank = 0; rank < nhost; ++rank) {
            int b = rdispl2[rank];
            int e = rdispl2[rank + 1];
            for (int i = b; i < e; ++i) {
                // note that there may be input gids with no corresponding
                // output gid so that the find may not return true and in
                // that case the tl->rank remains -1.
                // For example multisplit gids or simulation of a subset of
                // cells.
                const auto itl_it = gid2tarlist.find(r2[i]);
                if (itl_it != gid2tarlist.end()) {
                    TarList* tl = itl_it->second;
                    tl->rank = rank;
                }
            }
        }
    }

    if (use_phase2) {
        random_init(nrnmpi_myid + 1);
        for (const auto& gid2tar: gid2tarlist) {
            TarList* tl = gid2tar.second;
            if (tl->rank >= 0) {  // only if output gid is spike generating
                phase2organize(tl);
            }
        }
        random_delete();
    }

    // For clarity, use the all2allv_int style of information flow
    // from source to destination as above
    // and also use a uniform code
    // for copying one and two phase information from a TarList to
    // develop the s, scnt, and sdispl3 buffers. That is, a buffer list
    // section in s for either a one-phase list or the much shorter
    // (individually) lists for first and second phases, has a
    // gid, size, totalsize header for each list where totalsize
    // is only present if the gid is an output gid (for
    // NrnMultisend_Send.ntarget_host used for conservation).
    // Note that totalsize is tl->indices[tl->size]

    // how much to send to each rank
    std::vector<int> scnt3(nhost, 0);
    for (const auto& gid2tar: gid2tarlist) {
        TarList* tl = gid2tar.second;
        if (tl->rank < 0) {
            // When the output gid does not generate spikes, that rank
            // is not interested if there is a target list for it.
            // If the output gid does not exist, there is no rank.
            // In either case ignore this target list.
            continue;
        }
        if (tl->indices) {
            // indices[size] is the size of list but size of those
            // are the sublist phase 2 destination ranks which
            // don't get sent as part of the phase 2 target list.
            // Also there is a phase 1 target list of size so there
            // are altogether size+1 target lists.
            // (one phase 1 list and size phase 2 lists)
            scnt3[tl->rank] += tl->size + 2;  // gid, size, list
            for (int i = 0; i < tl->size; ++i) {
                scnt3[tl->list[tl->indices[i]]] += tl->indices[i + 1] - tl->indices[i] + 1;
                // gid, size, list
            }
        } else {
            // gid, list size, list
            scnt3[tl->rank] += tl->size + 2;
        }
        if (use_phase2) {
            // The phase 1 header has as its third element, the
            // total list size (needed for conservation);
            scnt3[tl->rank] += 1;
        }
    }
    const auto sdispl4 = newoffset(scnt3);
    auto sdispl4_ = sdispl4;
    std::vector<int> s3(sdispl4[nhost], 0);
    // what to send to each rank
    for (const auto& gid2tar: gid2tarlist) {
        int gid = gid2tar.first;
        TarList* tl = gid2tar.second;
        if (tl->rank < 0) {
            continue;
        }
        if (tl->indices) {
            s3[sdispl4_[tl->rank]++] = gid;
            s3[sdispl4_[tl->rank]++] = tl->size;
            if (use_phase2) {
                s3[sdispl4_[tl->rank]++] = tl->indices[tl->size];
            }
            for (int i = 0; i < tl->size; ++i) {
                s3[sdispl4_[tl->rank]++] = tl->list[tl->indices[i]];
            }
            for (int i = 0; i < tl->size; ++i) {
                int rank = tl->list[tl->indices[i]];
                s3[sdispl4_[rank]++] = gid;
                assert(tl->indices[i + 1] > tl->indices[i]);
                s3[sdispl4_[rank]++] = tl->indices[i + 1] - tl->indices[i] - 1;
                for (int j = tl->indices[i] + 1; j < tl->indices[i + 1]; ++j) {
                    s3[sdispl4_[rank]++] = tl->list[j];
                }
            }
        } else {
            // gid, list size, list
            s3[sdispl4_[tl->rank]++] = gid;
            s3[sdispl4_[tl->rank]++] = tl->size;
            if (use_phase2) {
                s3[sdispl4_[tl->rank]++] = tl->size;
            }
            for (int i = 0; i < tl->size; ++i) {
                s3[sdispl4_[tl->rank]++] = tl->list[i];
            }
        }
        delete tl;
    }
    std::vector<int> r_return;
    std::vector<int> rdispl3;
    std::tie(r_return, rdispl3) = all2allv_int(s3, scnt3, sdispl4, "lists");
    return r_return;
}
}  // namespace coreneuron
#endif  // NRN_MULTISEND


================================================
FILE: coreneuron/network/netcon.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/mpi/nrnmpi.h"

#undef check
#if MAC
#define NetCon nrniv_Dinfo
#endif
namespace coreneuron {
class PreSyn;
class InputPreSyn;
class TQItem;
struct NrnThread;
struct Point_process;
class NetCvode;

// Integer tags returned by the type() overrides of the DiscreteEvent
// hierarchy declared below. The values are not contiguous.
#define DiscreteEventType 0
#define TstopEventType    1
#define NetConType        2
#define SelfEventType     3
#define PreSynType        4
#define NetParEventType   7
#define InputPreSynType   20

// Polymorphic base of all event kinds declared in this header.
struct DiscreteEvent {
    DiscreteEvent() = default;
    virtual ~DiscreteEvent() = default;
    // Send the event with the given delivery time (defined out of line).
    virtual void send(double deliverytime, NetCvode*, NrnThread*);
    // Deliver the event at time t (defined out of line).
    virtual void deliver(double t, NetCvode*, NrnThread*);
    // Tag identifying the concrete event class; one of the *Type macros.
    virtual int type() const {
        return DiscreteEventType;
    }
    // Defaults to true; not overridden by any class in this header.
    virtual bool require_checkpoint() {
        return true;
    }
    // Print helper taking a message prefix and a time (defined out of line).
    virtual void pr(const char*, double t, NetCvode*);
};

class NetCon: public DiscreteEvent {
  public:
    bool active_{};
    double delay_{1.0};
    Point_process* target_{};
    union {
        int weight_index_{};
        int srcgid_;  // only to help InputPreSyn during setup
        // before weights are read and stored. Saves on transient
        // memory requirements by avoiding storage of all group file
        // netcon_srcgid lists. ie. that info is copied into here.
    } u;

    NetCon() = default;
    virtual ~NetCon() = default;
    virtual void send(double sendtime, NetCvode*, NrnThread*) override;
    virtual void deliver(double, NetCvode* ns, NrnThread*) override;
    virtual int type() const override {
        return NetConType;
    }
    virtual void pr(const char*, double t, NetCvode*) override;
};

// Event a Point_process sends to itself.
// All data members are value-initialized so that a freshly constructed
// SelfEvent never holds indeterminate values (in particular movable_, which
// is not assigned on every code path that creates a SelfEvent).
class SelfEvent: public DiscreteEvent {
  public:
    double flag_{};
    Point_process* target_{};
    void** movable_{};  // actually a TQItem**
    int weight_index_{};

    SelfEvent() = default;
    virtual ~SelfEvent() = default;
    virtual void deliver(double, NetCvode*, NrnThread*) override;
    // Always SelfEventType.
    virtual int type() const override {
        return SelfEventType;
    }

    virtual void pr(const char*, double t, NetCvode*) override;

  private:
    void call_net_receive(NetCvode*);
};

class ConditionEvent: public DiscreteEvent {
  public:
    // condition detection factored out of PreSyn for re-use
    ConditionEvent() = default;
    virtual ~ConditionEvent() = default;
    virtual bool check(NrnThread*);
    virtual double value(NrnThread*) {
        return -1.;
    }

    int flag_{};  // true when below, false when above. (changed from bool to int to avoid cray acc
                  // bug(?))
};

class PreSyn: public ConditionEvent {
  public:
#if NRNMPI
    unsigned char localgid_{};  // compressed gid for spike transfer
#endif
    int nc_index_{};  // replaces dil_, index into global NetCon** netcon_in_presyn_order_
    int nc_cnt_{};    // how many netcon starting at nc_index_
    int output_index_{};
    int gid_{-1};
    double threshold_{10.};
    int thvar_index_{-1};  // >=0 points into NrnThread._actual_v
    Point_process* pntsrc_{};

    PreSyn() = default;
    virtual ~PreSyn() = default;
    virtual void send(double sendtime, NetCvode*, NrnThread*) override;
    virtual void deliver(double, NetCvode*, NrnThread*) override;
    virtual int type() const override {
        return PreSynType;
    }

    virtual double value(NrnThread*) override;
    void record(double t);
#if NRN_MULTISEND
    int multisend_index_{-1};
#endif
};

class InputPreSyn: public DiscreteEvent {
  public:
    int nc_index_{-1};  // replaces dil_, index into global NetCon** netcon_in_presyn_order_
    int nc_cnt_{};      // how many netcon starting at nc_index_

    InputPreSyn() = default;
    virtual ~InputPreSyn() = default;
    virtual void send(double sendtime, NetCvode*, NrnThread*) override;
    virtual void deliver(double, NetCvode*, NrnThread*) override;
    virtual int type() const override {
        return InputPreSynType;
    }
#if NRN_MULTISEND
    int multisend_phase2_index_{-1};
#endif
};

class NetParEvent: public DiscreteEvent {
  public:
    int ithread_;     // for pr()
    double wx_, ws_;  // exchange time and "spikes to Presyn" time

    NetParEvent();
    virtual ~NetParEvent() = default;
    virtual void send(double, NetCvode*, NrnThread*) override;
    virtual void deliver(double, NetCvode*, NrnThread*) override;
    virtual int type() const override {
        return NetParEventType;
    }

    virtual void pr(const char*, double t, NetCvode*) override;
};
}  // namespace coreneuron


================================================
FILE: coreneuron/network/netcvode.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <float.h>
#include <map>
#include <mutex>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/network/netcon.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/network/netpar.hpp"
#include "coreneuron/utils/ivocvect.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/io/output_spikes.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/network/multisend.hpp"
#include "coreneuron/mechanism/membfunc.hpp"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

namespace coreneuron {
#define PP2NT(pp) (nrn_threads + (pp)->_tid)
#define PP2t(pp)  (PP2NT(pp)->_t)
//#define POINT_RECEIVE(type, tar, w, f) (*pnt_receive[type])(tar, w, f)

// Assigned 100 * DBL_EPSILON by the NetCvode constructor.
double NetCvode::eps_;
// The NetCvode object created on demand by mk_netcvode(); reset to nullptr by
// the destructor of that object.
NetCvode* net_cvode_instance;
bool cvode_active_;

/// Flag to use the bin queue
/// (NetCvode::bin_event calls enqueue_bin when set, insert otherwise).
bool nrn_use_bin_queue_ = 0;

// Create the global NetCvode instance on first call; later calls do nothing.
void mk_netcvode() {
    if (net_cvode_instance) {
        return;
    }
    net_cvode_instance = new NetCvode();
}

#ifdef DEBUG
// temporary
// Debug-only helper: report the current errno together with the caller
// supplied type and always return 1.
static int nrn_errno_check(int type) {
    printf("nrn_errno_check() was called on pid %d: errno=%d type=%d\n", nrnmpi_myid, errno, type);
    //  assert(0);
    return 1;
}
#endif

// for _OPENACC and/or NET_RECEIVE_BUFFERING
// sem 0:3 send event move
void net_sem_from_gpu(int sendtype,
                      int i_vdata,
                      int weight_index_,
                      int ith,
                      int ipnt,
                      double td,
                      double flag) {
    NrnThread& nt = nrn_threads[ith];
    Point_process* pnt = (Point_process*) nt._vdata[ipnt];
    if (sendtype == 0) {
        net_send(nt._vdata + i_vdata, weight_index_, pnt, td, flag);
    } else if (sendtype == 2) {
        net_move(nt._vdata + i_vdata, pnt, td);
    } else {
        net_event(pnt, td);
    }
}

// Create a SelfEvent for pnt and put it on the event queue of pnt's thread
// for delivery at time td.
//   v             slot that may receive the queue item; only used when it
//                 points into the thread's _vdata
//   weight_index_ stored in the SelfEvent
//   flag          stored in the SelfEvent; when it is 1.0 (and v points into
//                 _vdata) the queue item is written to *v
// Aborts if td is earlier than the thread's current time.
void net_send(void** v, int weight_index_, Point_process* pnt, double td, double flag) {
    // Check before the instance is dereferenced below.
    assert(net_cvode_instance);
    NrnThread* nt = PP2NT(pnt);
    NetCvodeThreadData& p = net_cvode_instance->p[nt->id];
    const bool v_in_vdata = v >= nt->_vdata;

    SelfEvent* se = new SelfEvent;
    se->flag_ = flag;
    se->target_ = pnt;
    se->weight_index_ = weight_index_;
    // needed for SaveState; never leave the member indeterminate
    se->movable_ = v_in_vdata ? v : nullptr;

    ++p.unreffed_event_cnt_;
    if (td < nt->_t) {
        char buf[100];
        snprintf(buf, sizeof(buf), "net_send td-t = %g", td - nt->_t);
        se->pr(buf, td, net_cvode_instance);
        abort();
        // not reached: abort() does not return
        hoc_execerror("net_send delay < 0", 0);
    }
    TQItem* q = net_cvode_instance->event(td, se, nt);
    if (flag == 1.0 && v_in_vdata) {
        *v = (void*) q;
    }
    // printf("net_send %g %s %g %p\n", td, pnt_name(pnt), flag, *v);
}

// Same as net_send; forwards all arguments unchanged.
void artcell_net_send(void** v, int weight_index_, Point_process* pnt, double td, double flag) {
    net_send(v, weight_index_, pnt, td, flag);
}

// Send a spike at the given time from the PreSyn associated with pnt.
// Raises hoc_execerror when time is earlier than the thread's current time.
void net_event(Point_process* pnt, double time) {
    NrnThread* nt = PP2NT(pnt);
    const auto presyn_slot = corenrn.get_pnttype2presyn()[pnt->_type];
    PreSyn* ps = nt->presyns + nt->pnt2presyn_ix[presyn_slot][pnt->_i_instance];
    // NOTE(review): ps comes from pointer arithmetic, so this test only
    // catches a null result of that arithmetic - confirm whether an index
    // check was intended.
    if (!ps) {
        return;
    }
    if (time < nt->_t) {
        char buf[100];
        sprintf(buf, "net_event time-t = %g", time - nt->_t);
        ps->pr(buf, time, net_cvode_instance);
        hoc_execerror("net_event time < t", 0);
    }
    ps->send(time, net_cvode_instance, nt);
}

// Allocate the per-thread event queue and pre-size the inter-thread buffer.
NetCvodeThreadData::NetCvodeThreadData()
    : tqe_(new TQueue<QTYPE>()) {
    inter_thread_events_.reserve(1000);
}

// Release the event queue allocated by the constructor (or clear_events).
NetCvodeThreadData::~NetCvodeThreadData() {
    delete tqe_;
}

/// If the PreSyn is on a different thread than the target,
/// we have to lock the buffer.
/// Appends (db, td) to the inter-thread buffer while holding the mutex.
void NetCvodeThreadData::interthread_send(double td, DiscreteEvent* db, NrnThread* /* nt */) {
    std::lock_guard<OMP_Mutex> guard(mut);
    inter_thread_events_.push_back(InterThreadEvent{db, td});
}

// Move the buffered inter-thread events of nt's thread onto its queue.
void interthread_enqueue(NrnThread* nt) {
    NetCvodeThreadData& thread_data = net_cvode_instance->p[nt->id];
    thread_data.enqueue(net_cvode_instance, nt);
}

// Pass every buffered inter-thread event to nc->bin_event and empty the
// buffer. The mutex is held for the whole operation.
void NetCvodeThreadData::enqueue(NetCvode* nc, NrnThread* nt) {
    std::lock_guard<OMP_Mutex> guard(mut);
    for (auto it = inter_thread_events_.begin(); it != inter_thread_events_.end(); ++it) {
        nc->bin_event(it->t_, it->de_, nt);
    }
    inter_thread_events_.clear();
}

// Set the tolerance, the event-printing flag and start with per-thread data
// for a single thread.
NetCvode::NetCvode() {
    eps_ = 100. * DBL_EPSILON;
#if PRINT_EVENT
    print_event_ = 1;
#else
    print_event_ = 0;
#endif
    // Start empty, then let p_construct allocate the data for one thread.
    p = nullptr;
    pcnt_ = 0;
    p_construct(1);
    // eventually these should not have to be thread safe
    // for parallel network simulations hardly any presyns have
    // a threshold and it can be very inefficient to check the entire
    // presyn list for thresholds during the fixed step method.
    // So keep a threshold list.
}

// Free the per-thread data and clear the global pointer if it refers to
// this object.
NetCvode::~NetCvode() {
    p_construct(0);

    if (net_cvode_instance == this) {
        net_cvode_instance = nullptr;
    }
}

// Size the per-thread data of the global NetCvode instance to nrn_nthread.
void nrn_p_construct() {
    net_cvode_instance->p_construct(nrn_nthread);
}

void NetCvode::p_construct(int n) {
    if (pcnt_ != n) {
        if (p) {
            delete[] p;
            p = nullptr;
        }

        if (n > 0)
            p = new NetCvodeThreadData[n];
        else
            p = nullptr;

        pcnt_ = n;
    }

    for (int i = 0; i < n; ++i)
        p[i].unreffed_event_cnt_ = 0;
}

// Put db on the queue of nt's thread at time td, using the bin queue when
// nrn_use_bin_queue_ is set and the regular queue otherwise.
TQItem* NetCvode::bin_event(double td, DiscreteEvent* db, NrnThread* nt) {
    const bool use_bins = nrn_use_bin_queue_;
#if PRINT_EVENT
    if (print_event_) {
        db->pr(use_bins ? "binq send" : "send", td, this);
    }
#endif
    auto* queue = p[nt->id].tqe_;
    if (use_bins) {
        return queue->enqueue_bin(td, db);
    }
    return queue->insert(td, db);
}

// Insert db into the regular queue of nt's thread at time td.
TQItem* NetCvode::event(double td, DiscreteEvent* db, NrnThread* nt) {
#if PRINT_EVENT
    if (print_event_) {
        db->pr("send", td, this);
    }
#endif
    NetCvodeThreadData& thread_data = p[nt->id];
    return thread_data.tqe_->insert(td, db);
}

void NetCvode::clear_events() {
    // DiscreteEvents may already have gone out of existence so the tqe_
    // may contain many invalid item data pointers
    enqueueing_ = 0;
    for (int i = 0; i < nrn_nthread; ++i) {
        NetCvodeThreadData& d = p[i];
        delete d.tqe_;
        d.tqe_ = new TQueue<QTYPE>();
        d.unreffed_event_cnt_ = 0;
        d.inter_thread_events_.clear();
        d.tqe_->nshift_ = -1;
        d.tqe_->shift_bin(nrn_threads->_t - 0.5 * nrn_threads->_dt);
    }
}

void NetCvode::init_events() {
    for (int i = 0; i < nrn_nthread; ++i) {
        p[i].tqe_->nshift_ = -1;
        p[i].tqe_->shift_bin(nrn_threads->_t - 0.5 * nrn_threads->_dt);
    }

    for (int tid = 0; tid < nrn_nthread; ++tid) {  // can be done in parallel
        NrnThread* nt = nrn_threads + tid;

        for (int ipre = 0; ipre < nt->n_presyn; ++ipre) {
            PreSyn* ps = nt->presyns + ipre;
            ps->flag_ = false;
        }

        for (int inetc = 0; inetc < nt->n_netcon; ++inetc) {
            NetCon* d = nt->netcons + inetc;
            if (d->target_) {
                int type = d->target_->_type;
                if (corenrn.get_pnt_receive_init()[type]) {
                    (*corenrn.get_pnt_receive_init()[type])(d->target_, d->u.weight_index_, 0);
                } else {
                    int cnt = corenrn.get_pnt_receive_size()[type];
                    double* wt = nt->weights + d->u.weight_index_;
                    // not the first
                    for (int j = 1; j < cnt; ++j) {
                        wt[j] = 0.;
                    }
                }
            }
        }
    }
}

bool NetCvode::deliver_event(double til, NrnThread* nt) {
    TQItem* q = p[nt->id].tqe_->atomic_dq(til);
    if (q == nullptr) {
        return false;
    }

    DiscreteEvent* de = q->data_;
    double tt = q->t_;
    delete q;
#if PRINT_EVENT
    if (print_event_) {
        de->pr("deliver", tt, this);
    }
#endif
    de->deliver(tt, this, nt);

    /// In case of a self event we need to delete the self event
    if (de->type() == SelfEventType) {
        delete static_cast<SelfEvent*>(de);
    }
    return true;
}

void net_move(void** v, Point_process* pnt, double tt) {
    // assert, if possible that *v == pnt->movable.
    if (!(*v))
        hoc_execerror("No event with flag=1 for net_move in ",
                      corenrn.get_memb_func(pnt->_type).sym);

    TQItem* q = (TQItem*) (*v);
    // printf("net_move tt=%g %s *v=%p\n", tt, memb_func[pnt->_type].sym, *v);
    if (tt < PP2t(pnt))
        nrn_assert(0);

    net_cvode_instance->move_event(q, tt, PP2NT(pnt));
}

/// Artificial-cell entry point for moving an event; forwards to net_move().
void artcell_net_move(void** v, Point_process* pnt, double tt) {
    return net_move(v, pnt, tt);
}

/// Reschedule queue item `q` to time `tnew` inside the queue of thread `nt`.
void NetCvode::move_event(TQItem* q, double tnew, NrnThread* nt) {
#if PRINT_EVENT
    if (print_event_) {
        SelfEvent* se = static_cast<SelfEvent*>(q->data_);
        printf("NetCvode::move_event self event target %s t=%g, old=%g new=%g\n",
               corenrn.get_memb_func(se->target_->_type).sym,
               nt->_t,
               q->t_,
               tnew);
    }
#endif

    NetCvodeThreadData& owner = p[nt->id];
    owner.tqe_->move(q, tnew);
}

/// Flush the inter-thread buffer of `nt` into its queue, then keep delivering
/// events bounded by `til` until deliver_event() reports nothing was delivered.
void NetCvode::deliver_events(double til, NrnThread* nt) {
    // first move any events other threads parked for us onto our own queue
    p[nt->id].enqueue(this, nt);

    bool delivered = true;
    while (delivered) {
        delivered = deliver_event(til, nt);
    }
}

/// Append (gid_, tt) to the global spike vectors when this PreSyn has a
/// non-negative gid. The spike vector lock is held for the whole call.
void PreSyn::record(double tt) {
    spikevec_lock();
    const bool has_gid = gid_ > -1;
    if (has_gid) {
        spikevec_gid.push_back(gid_);
        spikevec_time.push_back(tt);
    }
    spikevec_unlock();
}

/// Edge detector on value(nt): returns true only on the call where the value
/// becomes positive while flag_ was clear. flag_ is cleared again as soon as
/// the value is no longer positive.
bool ConditionEvent::check(NrnThread* nt) {
    const bool positive = value(nt) > 0.0;
    if (!positive) {
        flag_ = false;
        return false;
    }
    if (flag_ == false) {
        flag_ = true;
        return true;
    }
    return false;
}

/// Default send: put this event on the queue of thread `nt` for time `tt`.
/// The queue item returned by NetCvode::event() is not kept.
void DiscreteEvent::send(double tt, NetCvode* ns, NrnThread* nt) {
    static_cast<void>(ns->event(tt, this, nt));
}

/// Default delivery: this implementation intentionally does nothing and ignores
/// all of its arguments.
void DiscreteEvent::deliver(double /* tt */, NetCvode* /* ns */, NrnThread* /* nt */) {}

/// Debug print: writes the prefix `s` followed by the event kind and the time
/// `tt` (15 significant digits) to stdout.
void DiscreteEvent::pr(const char* s, double tt, NetCvode* /* ns */) {
    const char* const prefix = s;
    printf("%s DiscreteEvent %.15g\n", prefix, tt);
}

/// Queue this NetCon for delivery at `tt` on its target's thread.
/// Inactive NetCons and NetCons without a target are silently ignored; the
/// target's thread must be the calling thread `nt`.
void NetCon::send(double tt, NetCvode* ns, NrnThread* nt) {
    if (!active_ || !target_) {
        return;
    }
    nrn_assert(PP2NT(target_) == nt);
    ns->bin_event(tt, this, PP2NT(target_));
}

void NetCon::deliver(double tt, NetCvode* /* ns */, NrnThread* nt) {
    nrn_assert(target_);

    if (PP2NT(target_) != nt)
        printf("NetCon::deliver nt=%d target=%d\n", nt->id, PP2NT(target_)->id);

    nrn_assert(PP2NT(target_) == nt);
    int typ = target_->_type;
    nt->_t = tt;

    // printf("NetCon::deliver t=%g tt=%g %s\n", t, tt, pnt_name(target_));
    std::string ss("net-receive-");
    ss += nrn_get_mechname(typ);
    Instrumentor::phase p_get_pnt_receive(ss.c_str());
    (*corenrn.get_pnt_receive()[typ])(target_, u.weight_index_, 0);
#ifdef DEBUG
    if (errno && nrn_errno_check(typ))
        hoc_warning("errno set during NetCon deliver to NET_RECEIVE", (char*) 0);
#endif
}

/// Debug print: prefix `s`, the target's mechanism name and instance index,
/// and the time `tt`. Dereferences target_ unconditionally.
void NetCon::pr(const char* s, double tt, NetCvode* /* ns */) {
    const char* const mech_name = corenrn.get_memb_func(target_->_type).sym;
    printf("%s NetCon target=%s[%d] %.15g\n", s, mech_name, target_->_i_instance, tt);
}

/// Record the spike at time `tt` and fan it out to the NetCons of this PreSyn.
///
/// The NetCons are the nc_cnt_ entries of netcon_in_presyn_order_ starting at
/// nc_index_; they are visited from last to first. For each active NetCon with
/// a target the delivery time is tt plus the NetCon delay. Targets on the
/// calling thread `nt` are queued with bin_event(); targets on another thread
/// go through that thread's interthread_send().
///
/// With NRNMPI, a PreSyn whose output_index_ is non-negative is additionally
/// passed to multisend (when built with NRN_MULTISEND and use_multisend_ is
/// set) or to one of the spike-exchange output routines.
void PreSyn::send(double tt, NetCvode* ns, NrnThread* nt) {
    record(tt);
    for (int i = nc_cnt_ - 1; i >= 0; --i) {
        NetCon* d = netcon_in_presyn_order_[nc_index_ + i];
        if (d->active_ && d->target_) {
            NrnThread* n = PP2NT(d->target_);

            if (nt == n)
                ns->bin_event(tt + d->delay_, d, n);
            else
                ns->p[n->id].interthread_send(tt + d->delay_, d, n);
        }
    }

#if NRNMPI
    if (output_index_ >= 0) {
        // NOTE: the preprocessor branches below open a block either as the
        // `else` of the multisend test or as a plain scope; both are closed by
        // the same brace after the output-event calls.
#if NRN_MULTISEND
        if (use_multisend_) {
            nrn_multisend_send(this, tt, nt);
        } else {
#else
        {
#endif
            // compressed one-byte local gid vs. full output index
            if (nrn_use_localgid_) {
                nrn_outputevent(localgid_, tt);
            } else {
                nrn2ncs_outputevent(output_index_, tt);
            }
        }
    }
#endif  // NRNMPI
}

void InputPreSyn::send(double tt, NetCvode* ns, NrnThread* nt) {
    for (int i = nc_cnt_ - 1; i >= 0; --i) {
        NetCon* d = netcon_in_presyn_order_[nc_index_ + i];
        if (d->active_ && d->target_) {
            NrnThread* n = PP2NT(d->target_);

            if (nt == n)
                ns->bin_event(tt + d->delay_, d, n);
            else
                ns->p[n->id].interthread_send(tt + d->delay_, d, n);
        }
    }
}

/// Not meant to be called: a PreSyn is never queued with a delay, so reaching
/// this function trips assert(0). All arguments are ignored.
void PreSyn::deliver(double, NetCvode*, NrnThread*) {
    assert(0);  // no PreSyn delay.
}

/// Not meant to be called: an InputPreSyn is never queued with a delay, so
/// reaching this function trips assert(0). All arguments are ignored.
void InputPreSyn::deliver(double, NetCvode*, NrnThread*) {
    assert(0);  // no InputPreSyn delay.
}

/// Deliver a self event: set the target's time to `tt` and invoke its
/// NET_RECEIVE via call_net_receive(). The target must live on thread `nt`.
void SelfEvent::deliver(double tt, NetCvode* ns, NrnThread* nt) {
    nrn_assert(nt == PP2NT(target_));

    // the receive block must observe the delivery time as the current time
    PP2t(target_) = tt;
    call_net_receive(ns);
}

/// Invoke the NET_RECEIVE function of the target's mechanism with this event's
/// weight index and flag, then decrement the unreffed-event counter of the
/// target's thread.
void SelfEvent::call_net_receive(NetCvode* ns) {
    const int type = target_->_type;
    (*corenrn.get_pnt_receive()[type])(target_, weight_index_, flag_);

#ifdef DEBUG
    if (errno && nrn_errno_check(target_->_type))
        hoc_warning("errno set during SelfEvent deliver to NET_RECEIVE", (char*) 0);
#endif

    const int tid = PP2NT(target_)->id;
    --ns->p[tid].unreffed_event_cnt_;
}

/// Debug print: prefix `s`, then the target name, the time `tt` and the flag.
void SelfEvent::pr(const char* s, double tt, NetCvode*) {
    printf("%s SelfEvent target=%s %.15g flag=%g\n", s, pnt_name(target_), tt, flag_);
}

/// Advance the simulation from the current time of the first thread to `tstop`.
///
/// The number of steps is the truncated quotient of the remaining time and dt
/// (plus 1e-9). More than three steps without gap junctions use the grouped
/// stepping routine; otherwise single steps are taken. Afterwards every thread
/// is required to sit at the same time as the first one.
void ncs2nrn_integrate(double tstop) {
    const int total_sim_steps = static_cast<int>((tstop - nrn_threads->_t) / dt + 1e-9);
    const bool use_grouped_steps = total_sim_steps > 3 && !nrn_have_gaps;

    if (use_grouped_steps) {
        nrn_fixed_step_group_minimal(total_sim_steps);
    } else {
        nrn_fixed_single_steps_minimal(total_sim_steps, tstop);
    }

    // sanity check: all threads must have reached the same time
    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_assert(nrn_threads[i]._t == nrn_threads->_t);
    }
}

// factored this out from deliver_net_events so we can
// stay in the cache
// net_send_buffer added so checking can be done on gpu
// while event queueing is on cpu.
// Remember: passing a reference variable triggers a Cray
// compiler bug

/// Upward threshold-crossing test.
///
/// @param var     current value
/// @param thresh  threshold
/// @param flag    in/out state: cleared whenever `var` is not above `thresh`,
///                set when a crossing is reported
/// @return true only on the call where `var` is above `thresh` while `*flag`
///         was still clear
static bool pscheck(double var, double thresh, int* flag) {
    if (!(var > thresh)) {
        *flag = false;
        return false;
    }

    const bool crossing = (*flag == false);
    if (crossing) {
        *flag = true;
    }
    return crossing;
}

/// Distance of the watched voltage from the threshold: the entry of
/// nt->_actual_v at thvar_index_ minus threshold_.
double PreSyn::value(NrnThread* nt) {
    const double* voltages = nt->_actual_v;
    return voltages[thvar_index_] - threshold_;
}

/// Threshold detection for the fixed step method.
///
/// Tests the first nt->n_real_output PreSyns of thread `nt` for an upward
/// threshold crossing (pscheck), collects the indices of those that fired in
/// nt->_net_send_buffer, sends them from the host with delivery time
/// nt->_t + teps, and finally runs the watch-check function of every mechanism
/// type listed in nt->_watch_types.
///
/// Note that a thread with ncell == 0 returns early, after resetting
/// _net_send_buffer_cnt but before the WATCH handling.
void NetCvode::check_thresh(NrnThread* nt) {  // for default method
    Instrumentor::phase p("check-threshold");
    // small offset added to the current time when sending the detected spikes
    double teps = 1e-10;

    nt->_net_send_buffer_cnt = 0;
    int net_send_buf_count = 0;
    // local copies of the pointers so they can be named in the data clauses below
    PreSyn* presyns = nt->presyns;
    PreSynHelper* presyns_helper = nt->presyns_helper;
    double* actual_v = nt->_actual_v;

    if (nt->ncell == 0)
        return;

    // Detection loop; offloaded when nt->compute_gpu is set. The counter is
    // copied to and from the device and incremented atomically.
    nrn_pragma_acc(parallel loop present(
        nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
                       copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
    for (int i = 0; i < nt->n_real_output; ++i) {
        PreSyn* ps = presyns + i;
        PreSynHelper* psh = presyns_helper + i;
        int idx = 0;
        int thidx = ps->thvar_index_;
        double v = actual_v[thidx];
        double threshold = ps->threshold_;
        // the crossing state lives in the helper array, not in the PreSyn itself
        int* flag = &(psh->flag_);

        if (pscheck(v, threshold, flag)) {
#ifndef CORENEURON_ENABLE_GPU
            // CPU-only build: grow the send buffer on demand (doubling) before
            // the slot is written below
            nt->_net_send_buffer_cnt = net_send_buf_count;
            if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) {
                nt->_net_send_buffer_size *= 2;
                nt->_net_send_buffer = (int*) erealloc(nt->_net_send_buffer,
                                                       nt->_net_send_buffer_size * sizeof(int));
            }
#endif

            // reserve a unique slot for this PreSyn index
            nrn_pragma_acc(atomic capture)
            nrn_pragma_omp(atomic capture)
            idx = net_send_buf_count++;

            nt->_net_send_buffer[idx] = i;
        }
    }
    nrn_pragma_acc(wait(nt->stream_id))
    nt->_net_send_buffer_cnt = net_send_buf_count;

    // bring the filled part of the buffer back to the host when it was written
    // on the device
    if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
#ifdef CORENEURON_ENABLE_GPU
        int* nsbuffer = nt->_net_send_buffer;
#endif
        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
        nrn_pragma_acc(wait(nt->stream_id))
        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
    }

    // on CPU...
    for (int i = 0; i < nt->_net_send_buffer_cnt; ++i) {
        PreSyn* ps = nt->presyns + nt->_net_send_buffer[i];
        ps->send(nt->_t + teps, net_cvode_instance, nt);
    }

    // Types that have WATCH statements. If exist, then last element is 0.
    if (nt->_watch_types) {
        for (int i = 0; nt->_watch_types[i] != 0; ++i) {
            int type = nt->_watch_types[i];
            (*corenrn.get_watch_check()[type])(nt, nt->_ml_list[type]);
            // may generate net_send events (with 0 (teps) delay)
        }
    }
}

// WATCH statements are rare. Conceptually they are very similar to
// PreSyn thresholds as above but an optimal performance implementation for GPU is
// not obvious. Each WATCH statement threshold test could make use of
// pscheck.  Note that it is possible that there are several active WATCH
// statements for a given POINT_PROCESS instance as well as none active.
// Also WATCH statements switch between active and inactive state.
//
// In NEURON,
// both PreSyn and WatchCondition were subclasses of ConditionEvent. When
// a WatchCondition fired in the fixed step method, it was placed on the queue
// with a delivery time of t+teps. WatchCondition::deliver called the NET_RECEIVE
// block with proper flag ( but nullptr weight vector). WatchConditions
// were created,added/removed,destroyed from a list as necessary.
// Perhaps the most commonly used WATCH statement is in the context of a
// ThresholdDetect Point_process which watches voltage and compares to
// an instance specific threshold parameter. A firing ThresholdDetect instance
// would call net_event(tdeliver) which then feeds into the standard
// artcell PreSyn sequence (using pntsrc_ instead of thvar_index_).
//
// So... the PreSyns have the same order as they are checked (although PreSyn
// data is AoS instead of SoA and nested 'if' means a failure of SIMD.)
// But if multiple WATCH, there is (from one kind of implementation viewpoint),
// yet another 'if' with regard to whether a WATCH is active. And if there
// are multiple WATCH, the size of the list is dynamic.
//
// An experimental implementation is to check all WATCH of all instances
// of a type with the proviso that there is an active flag for each WATCH.
// ie. active, below, var1, var2 are all SoA (except one of the var may
// be voltage). Can use 'if (active && pscheck(var1, var2, &below))'
// The mod file net_send_buffering fragments can be used which
// ultimately call net_send using a transient SelfEvent. ie. all
// checking computation takes place in the context of the mod file without
// using explicit WatchCondition instances.

/// Deliver the events of thread `nt` that are due in the current step of the
/// fixed step method: events including binqueue events up to t+dt/2.
///
/// The thread time nt->_t is saved on entry and restored once the queues have
/// been drained. Afterwards the net receive buffer is updated and every
/// registered net-buf-receive function is called for this thread.
void NetCvode::deliver_net_events(NrnThread* nt) {  // for default method
#if NRN_MULTISEND
    if (use_multisend_ && nt->id == 0) {
        nrn_multisend_advance();
    }
#endif
    int tid = nt->id;
    // restored after delivery
    double tsav = nt->_t;
    // delivery horizon: half a step past the current time
    double tm = nt->_t + 0.5 * nt->_dt;
tryagain:
    // one of the events on the main queue may be a NetParEvent
    // which due to dt round off error can result in an event
    // placed on the bin queue to be delivered now, which
    // can put 0 delay events on to the main queue. So loop til
    // no events. The alternative would be to deliver an idt=0 event
    // immediately but that would very much change the sequence
    // with respect to what is being done here and it is unclear
    // how to fix the value of t there. This can be a do while loop
    // but I do not want to affect the case of not using a bin queue.

    if (nrn_use_bin_queue_) {
        TQItem* q;
        // drain the current bin; the item is released before its event is delivered
        while ((q = p[tid].tqe_->dequeue_bin()) != 0) {
            DiscreteEvent* db = q->data_;

#if PRINT_EVENT
            if (print_event_) {
                db->pr("binq deliver", nrn_threads->_t, this);
            }
#endif

            delete q;
            db->deliver(nt->_t, this, nt);
        }
        // assert(int(tm/nt->_dt)%1000 == p[tid].tqe_->nshift_);
    }

    deliver_events(tm, nt);

    if (nrn_use_bin_queue_) {
        // something landed in the current bin during delivery: go around again
        if (p[tid].tqe_->top()) {
            goto tryagain;
        }
        p[tid].tqe_->shift_bin(tm);
    }

    nt->_t = tsav;

    /*before executing on gpu, we have to update the NetReceiveBuffer_t on GPU */
    update_net_receive_buffer(nt);

    // each entry pairs a receive function (first) with a mechanism type
    // (second) that is used to name the profiling phase
    for (auto& net_buf_receive: corenrn.get_net_buf_receive()) {
        std::string ss("net-buf-receive-");
        ss += nrn_get_mechname(net_buf_receive.second);
        Instrumentor::phase p_net_buf_receive(ss.c_str());
        (*net_buf_receive.first)(nt);
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/network/netcvode.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/utils/nrnmutdec.hpp"
#include "coreneuron/network/tqueue.hpp"

#define PRINT_EVENT 0

/** QTYPE options include: spltree, pq_que
 *  STL priority queue is used instead of the splay tree by default.
 *  @todo: check if stl queue works with move_event functions.
 */

#ifdef ENABLE_SPLAYTREE_QUEUING
#define QTYPE spltree
#else
#define QTYPE pq_que
#endif
namespace coreneuron {

// defined in coreneuron/network/cvodestb.cpp
extern void init_net_events(void);
extern void nrn_play_init(void);
extern void deliver_net_events(NrnThread*);
extern void nrn_deliver_events(NrnThread*);
extern void fixed_play_continuous(NrnThread*);

struct DiscreteEvent;
class NetCvode;

extern NetCvode* net_cvode_instance;
extern void interthread_enqueue(NrnThread*);

/// Entry of a per-thread inter-thread event buffer
/// (NetCvodeThreadData::inter_thread_events_).
struct InterThreadEvent {
    DiscreteEvent* de_;  ///< the buffered event
    double t_;           ///< presumably the delivery time of de_ — confirm against interthread_send()
};

/// Event-queue state owned by one NrnThread. NetCvode keeps an array of these
/// indexed by NrnThread::id.
class NetCvodeThreadData {
  public:
    /// Counter decremented each time a SelfEvent's NET_RECEIVE has been called;
    /// reset to 0 by NetCvode::clear_events().
    int unreffed_event_cnt_ = 0;
    /// The thread's event queue; replaced by a fresh queue in clear_events().
    TQueue<QTYPE>* tqe_;
    /// Events buffered for this thread (see interthread_send()/enqueue()).
    std::vector<InterThreadEvent> inter_thread_events_;
    /// NOTE(review): presumably guards inter_thread_events_ — confirm in the implementation.
    OMP_Mutex mut;

    NetCvodeThreadData();
    virtual ~NetCvodeThreadData();
    /// Called by senders whose target lives on this (different) thread.
    void interthread_send(double, DiscreteEvent*, NrnThread*);
    /// Called at the start of NetCvode::deliver_events() for the owning thread.
    void enqueue(NetCvode*, NrnThread*);
};

/// Event scheduling and delivery front end for the fixed step method.
/// Holds one NetCvodeThreadData per thread in `p`.
class NetCvode {
  public:
    /// When non-zero (and PRINT_EVENT is compiled in) events are printed as
    /// they are sent, moved and delivered.
    int print_event_;
    /// NOTE(review): presumably the number of entries in `p` — confirm in p_construct().
    int pcnt_;
    /// Reset to 0 by clear_events().
    int enqueueing_;
    /// Per-thread queue data, indexed by NrnThread::id.
    NetCvodeThreadData* p;
    /// Relative tolerance factor used by eps().
    static double eps_;

    NetCvode(void);
    virtual ~NetCvode();
    void p_construct(int);
    /// Threshold detection for PreSyns and WATCH statements of one thread.
    void check_thresh(NrnThread*);
    /// @return eps_ scaled by the magnitude of x
    static double eps(double x) {
        return eps_ * fabs(x);
    }
    /// Insert an event into the given thread's queue.
    TQItem* event(double tdeliver, DiscreteEvent*, NrnThread*);
    /// Move an already queued item to a new time.
    void move_event(TQItem*, double, NrnThread*);
    TQItem* bin_event(double tdeliver, DiscreteEvent*, NrnThread*);
    void deliver_net_events(NrnThread*);          // for default staggered time step method
    void deliver_events(double til, NrnThread*);  // for initialization events
    bool deliver_event(double til, NrnThread*);   // uses TQueue atomically
    /// Replace every thread queue by a fresh, empty one.
    void clear_events();
    /// Re-align the bin queues and initialize PreSyn flags and NetCon weights.
    void init_events();
    void point_receive(int, Point_process*, double*, double);
};
}  // namespace coreneuron


================================================
FILE: coreneuron/network/netpar.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstdio>
#include <cstdlib>
#include <map>
#include <mutex>
#include <vector>

#include "coreneuron/nrnconf.h"
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpidec.h"

#include "coreneuron/network/netcon.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/utils/ivocvect.hpp"
#include "coreneuron/network/multisend.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/utils/utils.hpp"

#if NRNMPI
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
// Spike exchange state shared with the MPI layer; these are the arguments of
// the nrnmpi_spike_exchange*() calls further down in this file.

// Number of bytes used to encode one gid in the compressed spike buffers.
int localgid_size_;
// Number of spikes per rank that fit in the fixed-size part of the compressed
// receive buffer; spikes beyond that are read from spfixin_ovfl_.
int ag_send_nspike;
namespace coreneuron {
// One entry per rank: number of spikes received from that rank.
int* nrnmpi_nin_;
}
// NOTE(review): presumably the allocated size of spfixin_ovfl_ — confirm in the MPI layer.
int ovfl_capacity;
// Number of NRNMPI_Spike entries initially allocated for spikein.
int icapacity;
// Compressed output buffer: two header bytes holding the spike count, then
// one (time byte, gid bytes) record per spike.
unsigned char* spikeout_fixed;
// Overflow part of the compressed receive buffer.
unsigned char* spfixin_ovfl_;
// Fixed-size part of the compressed receive buffer, one slab of ag_send_size
// bytes per rank.
unsigned char* spikein_fixed;
// Size in bytes of one rank's slab in spikein_fixed.
int ag_send_size;
// Number of received spikes that are read from the overflow buffers.
int ovfl;
// Number of spikes queued for output since the last exchange.
int nout;
// Fixed-size spike buffers, only used when nrn_spikebuf_size > 0.
coreneuron::NRNMPI_Spikebuf* spbufout;
coreneuron::NRNMPI_Spikebuf* spbufin;
#endif

namespace coreneuron {
class PreSyn;
class InputPreSyn;

void nrn_spike_exchange_init();

#if NRNMPI
static double t_exchange_;
static double dt1_;  // 1/dt

NRNMPI_Spike* spikeout;
NRNMPI_Spike* spikein;

void nrn_timeout(int);
void nrn_spike_exchange(NrnThread*);
void nrn2ncs_outputevent(int netcon_output_index, double firetime);

// for compressed gid info during spike exchange
bool nrn_use_localgid_;
void nrn_outputevent(unsigned char localgid, double firetime);
std::vector<std::map<int, InputPreSyn*>> localmaps;

static int ocapacity_;  // for spikeout
// require it to be smaller than  min_interprocessor_delay.
static double wt_;   // wait time for nrnmpi_spike_exchange
static double wt1_;  // time to find the PreSyns and send the spikes.
static bool use_compress_;
static int spfixout_capacity_;
static int idxout_;
static void nrn_spike_exchange_compressed(NrnThread*);

#endif  // NRNMPI

// When false, the output-event and spike-exchange routines in this file return
// immediately.
static bool active_ = false;
// Interval after which a NetParEvent re-sends itself; derived from mindelay_
// in nrn_spike_exchange_init().
static double usable_mindelay_;
// NOTE(review): the trailing comment below is an unfinished sentence in the
// original source; in this file mindelay_ is only the starting value from
// which usable_mindelay_ is computed.
static double mindelay_;  // the one actually used. Some of our optional algorithms
// Set to 100 by nrn_need_npe() when it is still 0 and NetParEvents are needed.
static double last_maxstep_arg_;
static std::vector<NetParEvent> npe_;  // nrn_nthread of them

#if NRNMPI
// for combination of threads and mpi.
static OMP_Mutex mut;
#endif

/// Allocate the initial spike exchange buffers.
///
/// Does nothing unless MPI is enabled and `spikeout` has not been allocated
/// yet, so repeated calls are harmless. Allocates 100 NRNMPI_Spike entries
/// ({int gid; double time}, see nrnmpi.h) each for outgoing and incoming
/// spikes, one int per rank for the receive counts and, when
/// nrn_spikebuf_size > 0, the fixed-size spike buffers.
///
/// Every buffer goes through emalloc() so that an allocation failure is
/// detected here instead of surfacing later as a null pointer dereference
/// (the incoming buffer previously used an unchecked malloc()).
static void alloc_mpi_space() {
#if NRNMPI
    if (corenrn_param.mpi_enable && !spikeout) {
        ocapacity_ = 100;
        spikeout = (NRNMPI_Spike*) emalloc(ocapacity_ * sizeof(NRNMPI_Spike));
        icapacity = 100;
        spikein = (NRNMPI_Spike*) emalloc(icapacity * sizeof(NRNMPI_Spike));
        nrnmpi_nin_ = (int*) emalloc(nrnmpi_numprocs * sizeof(int));
#if nrn_spikebuf_size > 0
        spbufout = (NRNMPI_Spikebuf*) emalloc(sizeof(NRNMPI_Spikebuf));
        spbufin = (NRNMPI_Spikebuf*) emalloc(nrnmpi_numprocs * sizeof(NRNMPI_Spikebuf));
#endif
    }
#endif
}

/// Construct an event that is not yet bound to a thread (ithread_ == -1) with
/// both wx_ and ws_ zeroed. nrn_spike_exchange_init() assigns the thread index.
NetParEvent::NetParEvent()
    : ithread_(-1)
    , wx_(0.)
    , ws_(0.) {}

/// Queue this event on thread `nt` for delivery usable_mindelay_ after `tt`.
void NetParEvent::send(double tt, NetCvode* nc, NrnThread* nt) {
    const double tdeliver = tt + usable_mindelay_;
    nc->event(tdeliver, this, nt);
}

/// Deliver everything due up to `tt` on thread `nt`, ask the thread to stop
/// stepping, set its time to `tt` and re-queue this event for the next
/// interval.
void NetParEvent::deliver(double tt, NetCvode* nc, NrnThread* nt) {
    net_cvode_instance->deliver_events(tt, nt);

    nt->_t = tt;
    nt->_stop_stepping = 1;

    send(tt, nc, nt);
}

/// Debug print: prefix `m`, the thread index, the time `tt` and its distance
/// from the current time of that thread.
void NetParEvent::pr(const char* m, double tt, NetCvode*) {
    const double thread_time = nrn_threads[ithread_]._t;
    printf("%s NetParEvent %d t=%.15g tt-t=%g\n", m, ithread_, tt, tt - thread_time);
}

#if NRNMPI
/// Pack `gid` into localgid_size_ bytes at `c`, most significant byte first.
inline static void sppk(unsigned char* c, int gid) {
    int pos = localgid_size_;
    while (pos-- > 0) {
        c[pos] = gid & 255;
        gid >>= 8;
    }
}
/// Unpack a gid from localgid_size_ bytes at `c` (most significant byte
/// first); the inverse of sppk(). The first byte is always read.
inline static int spupk(unsigned char* c) {
    int gid = c[0];
    for (int pos = 1; pos < localgid_size_; ++pos) {
        gid = (gid << 8) + c[pos];
    }
    return gid;
}

/// Append a spike of the cell with one-byte local gid `localgid` to the
/// compressed output buffer.
///
/// Each record is two bytes: the firing time as a number of dt steps since the
/// last exchange (rounded), followed by the local gid. The buffer is doubled
/// when the write position reaches its capacity. No-op while spike exchange is
/// inactive. Serialized by the file-level mutex.
void nrn_outputevent(unsigned char localgid, double firetime) {
    if (!active_) {
        return;
    }
    std::lock_guard<OMP_Mutex> lock(mut);
    ++nout;

    // claim the two bytes of this record, then make sure they exist
    const int slot = idxout_;
    idxout_ = slot + 2;
    if (idxout_ >= spfixout_capacity_) {
        spfixout_capacity_ *= 2;
        spikeout_fixed = (unsigned char*) erealloc(spikeout_fixed,
                                                   spfixout_capacity_ * sizeof(unsigned char));
    }

    spikeout_fixed[slot] = (unsigned char) ((firetime - t_exchange_) * dt1_ + .5);
    spikeout_fixed[slot + 1] = localgid;
}

/// Queue an outgoing spike of cell `gid` fired at `firetime` for the next
/// spike exchange.
///
/// No-op while spike exchange is inactive. Serialized by the file-level mutex.
/// In compressed mode the spike is appended to spikeout_fixed as one time byte
/// (dt steps since the last exchange, rounded) followed by localgid_size_ gid
/// bytes. Otherwise it is stored as an NRNMPI_Spike in spikeout or, when
/// nrn_spikebuf_size > 0, in the fixed-size buffer first and in spikeout once
/// that is full. Growable buffers are doubled on demand.
void nrn2ncs_outputevent(int gid, double firetime) {
    if (!active_) {
        return;
    }
    std::lock_guard<OMP_Mutex> lock(mut);
    if (use_compress_) {
        nout++;
        // claim 1 + localgid_size_ bytes starting at i
        int i = idxout_;
        idxout_ += 1 + localgid_size_;
        if (idxout_ >= spfixout_capacity_) {
            spfixout_capacity_ *= 2;
            spikeout_fixed = (unsigned char*) erealloc(spikeout_fixed,
                                                       spfixout_capacity_ * sizeof(unsigned char));
        }
        // printf("%d nrnncs_outputevent %d %.20g %.20g %d\n", nrnmpi_myid, gid, firetime,
        // t_exchange_,
        //(int)((unsigned char)((firetime - t_exchange_)*dt1_ + .5)));
        spikeout_fixed[i++] = (unsigned char) ((firetime - t_exchange_) * dt1_ + .5);
        // printf("%d idx=%d firetime=%g t_exchange_=%g spfixout=%d\n", nrnmpi_myid, i, firetime,
        // t_exchange_, (int)spikeout_fixed[i-1]);
        sppk(spikeout_fixed + i, gid);
        // printf("%d idx=%d gid=%d spupk=%d\n", nrnmpi_myid, i, gid, spupk(spikeout_fixed+i));
    } else {
#if nrn_spikebuf_size == 0
        // uncompressed, no fixed-size buffer: everything goes to spikeout
        int i = nout++;
        if (i >= ocapacity_) {
            ocapacity_ *= 2;
            spikeout = (NRNMPI_Spike*) erealloc(spikeout, ocapacity_ * sizeof(NRNMPI_Spike));
        }
        // printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime);
        spikeout[i].gid = gid;
        spikeout[i].spiketime = firetime;
#else
        // uncompressed with fixed-size buffer: the first nrn_spikebuf_size
        // spikes go to spbufout, the rest overflow into spikeout
        int i = nout++;
        if (i >= nrn_spikebuf_size) {
            i -= nrn_spikebuf_size;
            if (i >= ocapacity_) {
                ocapacity_ *= 2;
                spikeout = (NRNMPI_Spike*) hoc_Erealloc(spikeout,
                                                        ocapacity_ * sizeof(NRNMPI_Spike));
                hoc_malchk();
            }
            spikeout[i].gid = gid;
            spikeout[i].spiketime = firetime;
        } else {
            spbufout->gid[i] = gid;
            spbufout->spiketime[i] = firetime;
        }
#endif
    }
    // printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime);
}
#endif  // NRNMPI

/// Tell whether NetParEvents are required: they are when spike exchange is
/// active or more than one thread is used.
///
/// Side effects: when required, a zero last_maxstep_arg_ is replaced by 100;
/// when not required, any existing NetParEvents are released.
static bool nrn_need_npe() {
    const bool needed = active_ || nrn_nthread > 1;
    if (!needed) {
        if (!npe_.empty()) {
            npe_.clear();
            npe_.shrink_to_fit();
        }
        return false;
    }

    if (last_maxstep_arg_ == 0) {
        last_maxstep_arg_ = 100.;
    }
    return true;
}

#define TBUFSIZE 0

/// (Re)initialize spike exchange for a run.
///
/// Returns immediately when no NetParEvents are needed. Otherwise allocates
/// the MPI buffers, derives usable_mindelay_ from mindelay_ (halved for
/// two-interval multisend, reduced by dt with more than one thread), rejects a
/// value below 1e-9 or below dt, creates one NetParEvent per thread and sends
/// it, and finally resets the output buffer state for the MPI case.
void nrn_spike_exchange_init() {
    // printf("nrn_spike_exchange_init\n");
    if (!nrn_need_npe()) {
        return;
    }
    alloc_mpi_space();
    usable_mindelay_ = mindelay_;
#if NRN_MULTISEND
    if (use_multisend_ && n_multisend_interval == 2) {
        usable_mindelay_ *= 0.5;
    }
#endif
    if (nrn_nthread > 1) {
        usable_mindelay_ -= dt;
    }
    // only rank 0 raises the error; the other ranks return silently
    if ((usable_mindelay_ < 1e-9) || (usable_mindelay_ < dt)) {
        if (nrnmpi_myid == 0) {
            hoc_execerror("usable mindelay is 0", "(or less than dt for fixed step method)");
        } else {
            return;
        }
    }

#if TBUFSIZE
    itbuf_ = 0;
#endif

#if NRN_MULTISEND
    if (use_multisend_) {
        nrn_multisend_init();
    }
#endif

    // make sure there is exactly one NetParEvent per thread
    if (npe_.size() != static_cast<std::size_t>(nrn_nthread)) {
        if (!npe_.empty()) {
            npe_.clear();
            npe_.shrink_to_fit();
        }
        npe_.resize(nrn_nthread);
    }
    // the events are sent with the usable_mindelay_ computed above, i.e.
    // before the compressed-mode adjustment further down
    for (int i = 0; i < nrn_nthread; ++i) {
        npe_[i].ithread_ = i;
        npe_[i].wx_ = 0.;
        npe_[i].ws_ = 0.;
        npe_[i].send(t, net_cvode_instance, nrn_threads + i);
    }
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        if (use_compress_) {
            // the first two bytes of the output buffer hold the spike count
            idxout_ = 2;
            t_exchange_ = t;
            dt1_ = rev_dt;
            // spike times are sent as a single byte of dt steps, so the
            // interval is a whole number of steps and at most 255 of them
            usable_mindelay_ = floor(mindelay_ * dt1_ + 1e-9) * dt;
            if (usable_mindelay_ * dt1_ >= 255.) {
                usable_mindelay_ = 255. / dt1_;
            }
            assert(usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) <= 255.);
        } else {
#if nrn_spikebuf_size > 0
            if (spbufout) {
                spbufout->nspike = 0;
            }
#endif
        }
        nout = 0;
    }
#endif  // NRNMPI
        // if (nrnmpi_myid == 0){printf("usable_mindelay_ = %g\n", usable_mindelay_);}
}

#if NRNMPI
/// Exchange the spikes queued since the last call with all ranks and send the
/// received ones to the matching InputPreSyns.
///
/// No-op while spike exchange is inactive. Delegates to multisend or to the
/// compressed exchange when those are in use. Received gids that are not in
/// gid2in are ignored. After the sends, the inter-thread buffers of all
/// threads are flushed into their queues. wt_ and wt1_ record the wall time
/// spent in the exchange and in the distribution, respectively.
void nrn_spike_exchange(NrnThread* nt) {
    Instrumentor::phase p_spike_exchange("spike-exchange");
    if (!active_) {
        return;
    }
#if NRN_MULTISEND
    if (use_multisend_) {
        nrn_multisend_receive(nt);
        return;
    }
#endif
    if (use_compress_) {
        nrn_spike_exchange_compressed(nt);
        return;
    }
#if TBUFSIZE
    nrnmpi_barrier();
#endif

#if nrn_spikebuf_size > 0
    spbufout->nspike = nout;
#endif
    double wt = nrn_wtime();

    // n is tested against 0 below to decide whether anything was received
    int n = nrnmpi_spike_exchange(
        nrnmpi_nin_, spikeout, icapacity, &spikein, ovfl, nout, spbufout, spbufin);

    wt_ = nrn_wtime() - wt;
    wt = nrn_wtime();
#if TBUFSIZE
    tbuf_[itbuf_++] = (unsigned long) nout;
    tbuf_[itbuf_++] = (unsigned long) n;
#endif

    errno = 0;
    // if (n > 0) {
    // printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout, n);
    //}
    // the output buffer has been consumed by the exchange
    nout = 0;
    if (n == 0) {
        return;
    }
#if nrn_spikebuf_size > 0
    // first the spikes that fit in each rank's fixed-size buffer ...
    for (int i = 0; i < nrnmpi_numprocs; ++i) {
        int nn = spbufin[i].nspike;
        if (nn > nrn_spikebuf_size) {
            nn = nrn_spikebuf_size;
        }
        for (int j = 0; j < nn; ++j) {
            auto gid2in_it = gid2in.find(spbufin[i].gid[j]);
            if (gid2in_it != gid2in.end()) {
                InputPreSyn* ps = gid2in_it->second;
                ps->send(spbufin[i].spiketime[j], net_cvode_instance, nt);
            }
        }
    }
    // ... then only the overflow spikes remain in spikein
    n = ovfl;
#endif  // nrn_spikebuf_size > 0
    for (int i = 0; i < n; ++i) {
        auto gid2in_it = gid2in.find(spikein[i].gid);
        if (gid2in_it != gid2in.end()) {
            InputPreSyn* ps = gid2in_it->second;
            ps->send(spikein[i].spiketime, net_cvode_instance, nt);
        }
    }
    nrn_multithread_job(interthread_enqueue);
    wt1_ = nrn_wtime() - wt;
}

/// Compressed variant of the spike exchange.
///
/// Outgoing spikes have been packed into spikeout_fixed as (time byte, gid
/// bytes) records; the first two bytes of that buffer receive the spike count
/// (big endian, hence the 16-bit limit). After the exchange every received
/// record is decoded - the time byte is a number of dt steps relative to
/// t_exchange_ - and sent to the matching InputPreSyn with a 1e-10 offset.
/// Records whose gid is unknown on this rank are ignored.
///
/// Each rank contributes up to ag_send_nspike records in its slab of
/// spikein_fixed; further records are read from spfixin_ovfl_. With one-byte
/// local gids (nrn_use_localgid_) the gid is resolved through the per-rank map
/// in localmaps and this rank's own spikes are skipped; otherwise the gid is
/// unpacked with spupk() and resolved through gid2in.
///
/// No-op while spike exchange is inactive.
void nrn_spike_exchange_compressed(NrnThread* nt) {
    if (!active_) {
        return;
    }
#if TBUFSIZE
    nrnmpi_barrier();
#endif

    // header: number of spikes sent by this rank, high byte first
    assert(nout < 0x10000);
    spikeout_fixed[1] = (unsigned char) (nout & 0xff);
    spikeout_fixed[0] = (unsigned char) (nout >> 8);

    double wt = nrn_wtime();

    int n = nrnmpi_spike_exchange_compressed(localgid_size_,
                                             spfixin_ovfl_,
                                             ag_send_nspike,
                                             nrnmpi_nin_,
                                             ovfl_capacity,
                                             spikeout_fixed,
                                             ag_send_size,
                                             spikein_fixed,
                                             ovfl);
    wt_ = nrn_wtime() - wt;
    wt = nrn_wtime();
#if TBUFSIZE
    tbuf_[itbuf_++] = (unsigned long) nout;
    tbuf_[itbuf_++] = (unsigned long) n;
#endif
    errno = 0;
    // if (n > 0) {
    // printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout, n);
    //}
    // the output buffer has been consumed; the next record goes right after
    // the two header bytes
    nout = 0;
    idxout_ = 2;
    if (n == 0) {
        t_exchange_ = nrn_threads->_t;
        return;
    }
    if (nrn_use_localgid_) {
        // running read position in the overflow buffer, shared by all ranks
        int idxov = 0;
        for (int i = 0; i < nrnmpi_numprocs; ++i) {
            int j, nnn;
            int nn = nrnmpi_nin_[i];
            if (nn) {
                if (i == nrnmpi_myid) {  // skip but may need to increment idxov.
                    if (nn > ag_send_nspike) {
                        idxov += (nn - ag_send_nspike) * (1 + localgid_size_);
                    }
                    continue;
                }
                // Bind by reference: the map is only read here, and copying it
                // for every sending rank on every exchange is pure overhead.
                const std::map<int, InputPreSyn*>& gps = localmaps[i];
                if (nn > ag_send_nspike) {
                    nnn = ag_send_nspike;
                } else {
                    nnn = nn;
                }
                // records of rank i in the fixed buffer start after its header
                int idx = 2 + i * ag_send_size;
                for (j = 0; j < nnn; ++j) {
                    // order is (firetime,gid) pairs.
                    double firetime = spikein_fixed[idx++] * dt + t_exchange_;
                    int lgid = (int) spikein_fixed[idx];
                    idx += localgid_size_;
                    auto gid2in_it = gps.find(lgid);
                    if (gid2in_it != gps.end()) {
                        InputPreSyn* ps = gid2in_it->second;
                        ps->send(firetime + 1e-10, net_cvode_instance, nt);
                    }
                }
                // the remaining records of rank i live in the overflow buffer
                for (; j < nn; ++j) {
                    double firetime = spfixin_ovfl_[idxov++] * dt + t_exchange_;
                    int lgid = (int) spfixin_ovfl_[idxov];
                    idxov += localgid_size_;
                    auto gid2in_it = gps.find(lgid);
                    if (gid2in_it != gps.end()) {
                        InputPreSyn* ps = gid2in_it->second;
                        ps->send(firetime + 1e-10, net_cvode_instance, nt);
                    }
                }
            }
        }
    } else {
        // fixed part: at most ag_send_nspike records per rank
        for (int i = 0; i < nrnmpi_numprocs; ++i) {
            int nn = nrnmpi_nin_[i];
            if (nn > ag_send_nspike) {
                nn = ag_send_nspike;
            }
            int idx = 2 + i * ag_send_size;
            for (int j = 0; j < nn; ++j) {
                // order is (firetime,gid) pairs.
                double firetime = spikein_fixed[idx++] * dt + t_exchange_;
                int gid = spupk(spikein_fixed + idx);
                idx += localgid_size_;
                auto gid2in_it = gid2in.find(gid);
                if (gid2in_it != gid2in.end()) {
                    InputPreSyn* ps = gid2in_it->second;
                    ps->send(firetime + 1e-10, net_cvode_instance, nt);
                }
            }
        }
        // overflow part: `ovfl` records stored back to back
        n = ovfl;
        int idx = 0;
        for (int i = 0; i < n; ++i) {
            double firetime = spfixin_ovfl_[idx++] * dt + t_exchange_;
            int gid = spupk(spfixin_ovfl_ + idx);
            idx += localgid_size_;
            auto gid2in_it = gid2in.find(gid);
            if (gid2in_it != gid2in.end()) {
                InputPreSyn* ps = gid2in_it->second;
                ps->send(firetime + 1e-10, net_cvode_instance, nt);
            }
        }
    }
    // In case of multiple threads some above ps->send events put
    // NetCon events into interthread buffers. Some of those may
    // need to be delivered early enough that the interthread buffers
    // need transfer to the thread event queues before the next dqueue_bin
    // while loop in deliver_net_events. So enqueue now...
    nrn_multithread_job(interthread_enqueue);
    t_exchange_ = nrn_threads->_t;
    wt1_ = nrn_wtime() - wt;
}

// Try to switch spike exchange to one-byte "local gid" encoding.
// Every rank counts its output gids (output_index_ >= 0); if the largest
// count over all ranks exceeds 256 nothing is changed. Otherwise each output
// PreSyn gets a one-byte localgid_, the per-rank gid lists are allgathered,
// and localmaps[rank] is filled so that a received local gid can be
// translated to the InputPreSyn (if any) this rank has for it.
static void mk_localgid_rep() {
    // how many output gids live on this rank
    int n_local = 0;
    for (const auto& entry: gid2out) {
        if (entry.second->output_index_ >= 0) {
            ++n_local;
        }
    }

    const int n_max = nrnmpi_int_allmax(n_local);
    if (n_max > 256) {
        // do not compress
        return;
    }
    localgid_size_ = sizeof(unsigned char);
    nrn_use_localgid_ = true;

    // Each rank contributes a fixed-size record: [count, gid_0, gid_1, ...].
    const int stride = n_max + 1;
    std::vector<int> recv(nrnmpi_numprocs * stride);
    std::vector<int> send(stride);

    send[0] = n_local;
    // define the local gid and record the real gid it stands for
    int next_local = 0;
    for (const auto& entry: gid2out) {
        auto* presyn = entry.second;
        if (presyn->output_index_ < 0) {
            continue;
        }
        presyn->localgid_ = (unsigned char) next_local;
        send[1 + next_local] = presyn->output_index_;
        ++next_local;
    }

    // exchange everything
    nrnmpi_int_allgather(send.data(), recv.data(), stride);
    errno = 0;

    // create the maps
    // there is a lot of potential for efficiency here. i.e. use of
    // perfect hash functions, or even simple Vectors.
    localmaps.clear();
    localmaps.resize(nrnmpi_numprocs);

    // fill in the maps for every rank but our own
    for (int rank = 0; rank < nrnmpi_numprocs; ++rank) {
        if (rank == nrnmpi_myid) {
            continue;
        }
        const int* record = recv.data() + rank * stride;
        const int count = record[0];
        const int* gids = record + 1;
        auto& rank_map = localmaps[rank];
        for (int k = 0; k < count; ++k) {
            auto gid2in_it = gid2in.find(int(gids[k]));
            if (gid2in_it != gid2in.end()) {
                rank_map[k] = gid2in_it->second;
            }
        }
    }
}

#endif  // NRNMPI

// may stimulate a gid for a cell not owned by this cpu. This allows
// us to run single cells or subnets and stimulate exactly according to
// their input in a full parallel net simulation.
// For some purposes, it may be useful to simulate a spike from a
// cell that does exist and would normally send its own spike, eg.
// recurrent stimulation. This can be useful in debugging where the
// spike raster comes from another implementation and one wants to
// get complete control of all input spikes without the confounding
// effects of output spikes from the simulated cells. In this case
// set the third arg to 1 and set the output cell thresholds very
// high so that they do not themselves generate spikes.
// Can only be called by thread 0 because of the ps->send.
void nrn_fake_fire(int gid, double spiketime, int fake_out) {
    auto gid2in_it = gid2in.find(gid);
    if (gid2in_it != gid2in.end()) {
        InputPreSyn* psi = gid2in_it->second;
        assert(psi);
        // printf("nrn_fake_fire %d %g\n", gid, spiketime);
        psi->send(spiketime, net_cvode_instance, nrn_threads);
    } else if (fake_out) {
        std::map<int, PreSyn*>::iterator gid2out_it;
        gid2out_it = gid2out.find(gid);
        if (gid2out_it != gid2out.end()) {
            PreSyn* ps = gid2out_it->second;
            assert(ps);
            // printf("nrn_fake_fire fake_out %d %g\n", gid, spiketime);
            ps->send(spiketime, net_cvode_instance, nrn_threads);
        }
    }
}

// Timeout value handed to nrn_timeout() by BBS_netpar_solve.
static int timeout_ = 0;

// Store a new timeout and return the value it replaces.
int nrn_set_timeout(int timeout) {
    const int previous = timeout_;
    timeout_ = timeout;
    return previous;
}

// Integrate up to `tstop` and report the wall-clock solver time on rank 0.
// With MPI enabled the run is bracketed by the timeout watchdog, finishes
// with a spike exchange and a barrier, and is refused when the minimum
// delay is smaller than the fixed time step.
void BBS_netpar_solve(double tstop) {
    const double wall_start = nrn_wtime();

#if NRNMPI
    if (corenrn_param.mpi_enable) {
        tstopunset;
        const double step = dt;
        const double usable_delay = mindelay_ - 1e-10;
        if (usable_delay < step) {
            // Only rank 0 reports the error; the other ranks just leave.
            if (nrnmpi_myid != 0) {
                return;
            }
            hoc_execerror("mindelay is 0", "(or less than dt for fixed step method)");
        }

        nrn_timeout(timeout_);
        nrn_multithread_job(interthread_enqueue);
        // integrate slightly past tstop
        ncs2nrn_integrate(tstop * (1. + 1e-11));
        nrn_spike_exchange(nrn_threads);
        nrn_timeout(0);
        if (!npe_.empty()) {
            auto& first_event = npe_[0];
            first_event.ws_ = 0.;
            first_event.wx_ = 0.;
        }
        nrnmpi_barrier();
    } else
#endif
    {
        ncs2nrn_integrate(tstop);
    }
    tstopunset;

    const bool report = (nrnmpi_myid == 0) && !corenrn_param.is_quiet();
    if (report) {
        printf("\nSolver Time : %g\n", nrn_wtime() - wall_start);
    }
}

double set_mindelay(double maxdelay) {
    double mindelay = maxdelay;
    last_maxstep_arg_ = maxdelay;

    // if all==1 then minimum delay of all NetCon no matter the source.
    // except if src in same thread as NetCon
    int all = (nrn_nthread > 1);
    // minumum delay of all NetCon having an InputPreSyn source

    /** we have removed nt_ from PreSyn. Build local map of PreSyn
     *  and NrnThread which will be used to find out if src in same thread as NetCon */
    std::map<PreSyn*, NrnThread*> presynmap;

    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        for (int i = 0; i < nt.n_presyn; ++i) {
            presynmap[nt.presyns + i] = nrn_threads + ith;
        }
    }

    for (int ith = 0; ith < nrn_nthread; ++ith) {
        NrnThread& nt = nrn_threads[ith];
        // if single thread or file transfer then definitely empty.
        std::vector<int>& negsrcgid_tid = nrnthreads_netcon_negsrcgid_tid[ith];
        size_t i_tid = 0;
        for (int i = 0; i < nt.n_netcon; ++i) {
            NetCon* nc = nt.netcons + i;
            bool chk = false;  // ignore nc.delay_
            int gid = nrnthreads_netcon_srcgid[ith][i];
            int tid = ith;
            if (!negsrcgid_tid.empty() && gid < -1) {
                tid = negsrcgid_tid[i_tid++];
            }
            PreSyn* ps;
            InputPreSyn* psi;
            netpar_tid_gid2ps(tid, gid, &ps, &psi);
            if (psi) {
                chk = true;
            } else if (all) {
                chk = true;
                // but ignore if src in same thread as NetCon
                if (ps && presynmap[ps] == &nt) {
                    chk = false;
                }
            }
            if (chk && nc->delay_ < mindelay) {
                mindelay = nc->delay_;
            }
        }
    }

#if NRNMPI
    if (corenrn_param.mpi_enable) {
        active_ = true;
        if (use_compress_) {
            if (mindelay / dt > 255) {
                mindelay = 255 * dt;
            }
        }

        // printf("%d netpar_mindelay local %g now calling nrnmpi_mindelay\n", nrnmpi_myid,
        // mindelay);
        //	double st = time();
        mindelay_ = nrnmpi_dbl_allmin(mindelay);
        //	add_wait_time(st);
        // printf("%d local min=%g  global min=%g\n", nrnmpi_myid, mindelay, mindelay_);
        errno = 0;
    } else
#endif  // NRNMPI
    {
        mindelay_ = mindelay;
    }
    return mindelay_;
}

/*  08-Nov-2010
The workhorse for spike exchange on up to 10K machines is MPI_Allgather
but as the number of machines becomes far greater than the fanout per
cell we have been exploring a class of exchange methods called multisend
where the spikes only go to those machines that need them and there is
overlap between communication and computation.  The number of variants of
multisend has grown so that some method selection function is needed
that makes sense.

The situation that needs to be captured by xchng_meth is

Allgather
multisend implemented as MPI_ISend
multisend DCMF (only for Blue Gene/P)
multisend record_replay (only for Blue Gene/P with recordreplay_v1r4m2.patch)

Note that Allgather allows spike compression and an allgather spike buffer
 with size chosen at setup time.  All methods allow bin queueing.

All the multisend methods should allow two phase multisend.

Note that, in principle, MPI_ISend allows the source to send the index
 of the target PreSyn to avoid a hash table lookup (even with a two phase
 variant)

RecordReplay should be best on the BG/P. The whole point is to make the
spike transfer initiation as lowcost as possible since that is what causes
most load imbalance. I.e. since 10K more spikes arrive than are sent, spikes
received per processor per interval are much more statistically
balanced than spikes sent per processor per interval. And presently
DCMF multisend injects 10000 messages per spike into the network which
is quite expensive. record replay avoids this overhead and the idea of
two phase multisend distributes the injection.
*/

// Configure the fixed-size (compressed) Allgather spike buffers.
//   nspike < 0  : leave everything as is.
//   nspike == 0 : release the buffers and turn compression off.
//   nspike > 0  : (re)allocate the buffers for `nspike` spikes per rank and
//                 turn compression on; with `gid_compress` also try the
//                 one-byte local gid representation.
// `xchng_meth` must be 0 unless NRN_MULTISEND is enabled, in which case a
// positive value selects multisend and nothing else is done.
// Returns the resulting ag_send_nspike, or 0 when MPI is not enabled or
// multisend was selected.
int nrnmpi_spike_compress(int nspike, bool gid_compress, int xchng_meth) {
#if NRNMPI
    if (corenrn_param.mpi_enable) {
#if NRN_MULTISEND
        if (xchng_meth > 0) {
            use_multisend_ = 1;
            return 0;
        }
#endif
        nrn_assert(xchng_meth == 0);
        if (nspike < 0) {
            return ag_send_nspike;
        }

        // Drop whatever was set up before; free() accepts null pointers.
        ag_send_nspike = 0;
        free(spikeout_fixed);
        spikeout_fixed = nullptr;
        free(spikein_fixed);
        spikein_fixed = nullptr;
        free(spfixin_ovfl_);
        spfixin_ovfl_ = nullptr;
        localmaps.clear();
        nrn_use_localgid_ = false;

        if (nspike == 0) {  // turn off
            use_compress_ = false;
            return ag_send_nspike;
        }

        // turn on
        use_compress_ = true;
        ag_send_nspike = nspike;
        if (gid_compress) {
            // we can only do this after everything is set up
            mk_localgid_rep();
            if (!nrn_use_localgid_ && nrnmpi_myid == 0) {
                printf(
                    "Notice: gid compression did not succeed. Probably more than 255 cells on "
                    "one "
                    "cpu.\n");
            }
        }
        if (!nrn_use_localgid_) {
            localgid_size_ = sizeof(unsigned int);
        }

        // Sizes are in bytes: one byte per spike plus the gid bytes.
        const int bytes_per_spike = 1 + localgid_size_;
        ag_send_size = 2 + ag_send_nspike * bytes_per_spike;
        spfixout_capacity_ = ag_send_size + 50 * bytes_per_spike;
        spikeout_fixed = (unsigned char*) emalloc(spfixout_capacity_);
        spikein_fixed = (unsigned char*) emalloc(nrnmpi_numprocs * ag_send_size);
        ovfl_capacity = 100;
        spfixin_ovfl_ = (unsigned char*) emalloc(ovfl_capacity * bytes_per_spike);
        return ag_send_nspike;
    } else
#endif
    {
        return 0;
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/network/netpar.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/network/partrans.hpp"
#include "coreneuron/sim/multicore.hpp"

namespace coreneuron {

// Spike exchange entry points. Only the declarations live in this header.
// NOTE(review): presumably prepares the exchange state before a run —
// confirm against the definition.
extern void nrn_spike_exchange_init(void);
// Exchange spikes; BBS_netpar_solve calls it with nrn_threads after
// integrating.
extern void nrn_spike_exchange(NrnThread* nt);
}  // namespace coreneuron


================================================
FILE: coreneuron/network/partrans.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/network/partrans.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

// This is the computational code for src->target transfer (e.g. gap junction)
// simulation.
// The setup code is in partrans_setup.cpp

namespace coreneuron {
// NOTE(review): presumably true when the model contains src->target
// transfers (e.g. gap junctions); it is only defined here — confirm where
// it is set.
bool nrn_have_gaps;

using namespace nrn_partrans;

// Array of per-thread transfer tables, indexed by thread id.
TransferThreadData* nrn_partrans::transfer_thread_data_;

// MPI_Alltoallv buffer info
double* nrn_partrans::insrc_buf_;   // Receive buffer for gap voltages
double* nrn_partrans::outsrc_buf_;  // Send buffer for gap voltages
// Counts and displacements passed to nrnmpi_dbl_alltoallv for the receive
// (insrc) and send (outsrc) sides. insrcdspl_[nrnmpi_numprocs] is used as
// the total length of insrc_buf_.
int* nrn_partrans::insrccnt_;
int* nrn_partrans::insrcdspl_;
int* nrn_partrans::outsrccnt_;
int* nrn_partrans::outsrcdspl_;

// Bring all transferred source values (usually voltages) from the threads'
// NrnThread._data arrays into insrc_buf_:
//   1) per thread, gather _data[src_indices[i]] into src_gather (offloaded
//      when the thread computes on the gpu) and update the host copy,
//   2) scatter src_gather into the send buffer outsrc_buf_,
//   3) exchange outsrc_buf_ -> insrc_buf_ (alltoallv with MPI enabled, plain
//      copy otherwise),
//   4) update the device copy of insrc_buf_ when any thread uses the gpu.
void nrnmpi_v_transfer() {
    // copy source values to outsrc_buf_ and mpi transfer to insrc_buf

    // note that same source value (usually voltage) may get copied to
    // several locations in outsrc_buf

    // gather the source values. can be done in parallel
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        auto& ttd = transfer_thread_data_[tid];
        auto* nt = &nrn_threads[tid];
        // Threads that send nothing to outsrc_buf_ are skipped entirely.
        int n = int(ttd.outsrc_indices.size());
        if (n == 0) {
            continue;
        }
        double* src_data = nt->_data;
        int* src_indices = ttd.src_indices.data();

        // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf
        double* src_gather = ttd.src_gather.data();
        size_t n_src_gather = ttd.src_gather.size();

        nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
                                             src_data [0:nt->_ndata],
                                             src_gather [0:n_src_gather]) if (nt->compute_gpu)
                           async(nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
        for (std::size_t i = 0; i < n_src_gather; ++i) {
            src_gather[i] = src_data[src_indices[i]];
        }
        // The OpenACC gather and this update are queued on the thread's
        // stream; the matching wait is in the scatter loop below.
        nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
                           async(nt->stream_id))
        nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
    }

    // copy gathered source values to outsrc_buf_
    bool compute_gpu = false;
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        if (nrn_threads[tid].compute_gpu) {
            compute_gpu = true;
            // make sure the asynchronous gather/update of this thread is done
            nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
        }
        TransferThreadData& ttd = transfer_thread_data_[tid];
        size_t n_outsrc_indices = ttd.outsrc_indices.size();
        int* outsrc_indices = ttd.outsrc_indices.data();
        double* src_gather = ttd.src_gather.data();
        int* src_gather_indices = ttd.gather2outsrc_indices.data();
        for (size_t i = 0; i < n_outsrc_indices; ++i) {
            outsrc_buf_[outsrc_indices[i]] = src_gather[src_gather_indices[i]];
        }
    }
    // compute_gpu is otherwise referenced only inside the pragma macros
    // below; presumably this silences an unused-variable warning when they
    // expand to nothing.
    static_cast<void>(compute_gpu);

    // transfer
    // last element in the displacement vector gives total length
    int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
#if NRNMPI
    if (corenrn_param.mpi_enable) {  // otherwise insrc_buf_ == outsrc_buf_
        nrnmpi_barrier();
        nrnmpi_dbl_alltoallv(
            outsrc_buf_, outsrccnt_, outsrcdspl_, insrc_buf_, insrccnt_, insrcdspl_);
    } else
#endif
    {  // Use the multiprocess code even for one process to aid debugging
        // For nrnmpi_numprocs == 1, insrc_buf_ and outsrc_buf_ are same size.
        for (int i = 0; i < n_insrc_buf; ++i) {
            insrc_buf_[i] = outsrc_buf_[i];
        }
    }

    // insrc_buf_ will get copied to targets via nrnthread_v_transfer
    nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu))
    nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu))
}

// Per-thread half of the transfer: scatter the received values in
// insrc_buf_ into this thread's NrnThread._data, i.e.
//   _nt->_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]
// for every target i of the thread. The loop is offloaded when
// _nt->compute_gpu is set.
void nrnthread_v_transfer(NrnThread* _nt) {
    // Copy insrc_buf_ values to the target locations. (An insrc_buf_ value
    // may be copied to several target locations.)
    TransferThreadData& ttd = transfer_thread_data_[_nt->id];
    size_t ntar = ttd.tar_indices.size();
    int* tar_indices = ttd.tar_indices.data();
    int* insrc_indices = ttd.insrc_indices.data();
    double* tar_data = _nt->_data;
    // last element in the displacement vector gives total length
    // (n_insrc_buf and ndata are referenced only by the OpenACC pragma, hence
    // the guard around their declarations)
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
    int ndata = _nt->_ndata;
#endif
    // tar_indices is copied in by the pragmas on every call, whereas the
    // other arrays are expected to be present on the device already.
    nrn_pragma_acc(parallel loop copyin(tar_indices [0:ntar])
                       present(insrc_indices [0:ntar],
                               tar_data [0:ndata],
                               insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
                           async(_nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
    for (size_t i = 0; i < ntar; ++i) {
        tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
    }
}

void nrn_partrans::copy_gap_indices_to_device() {
    // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu.
    if (insrcdspl_) {
        // TODO: we don't actually need to copy here, just allocate + associate
        // storage on the device
        cnrn_target_copyin(insrc_buf_, insrcdspl_[nrnmpi_numprocs]);
    }
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        const NrnThread* nt = nrn_threads + tid;
        if (!nt->compute_gpu) {
            continue;
        }

        const TransferThreadData& ttd = transfer_thread_data_[tid];

        if (!ttd.src_indices.empty()) {
            cnrn_target_copyin(ttd.src_indices.data(), ttd.src_indices.size());
            // TODO: we don't actually need to copy here, just allocate +
            // associate storage on the device.
            cnrn_target_copyin(ttd.src_gather.data(), ttd.src_gather.size());
        }

        if (ttd.insrc_indices.size()) {
            cnrn_target_copyin(ttd.insrc_indices.data(), ttd.insrc_indices.size());
        }
    }
}

void nrn_partrans::delete_gap_indices_from_device() {
    if (insrcdspl_) {
        int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
        cnrn_target_delete(insrc_buf_, n_insrc_buf);
    }
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        const NrnThread* nt = nrn_threads + tid;
        if (!nt->compute_gpu) {
            continue;
        }

        TransferThreadData& ttd = transfer_thread_data_[tid];

        if (!ttd.src_indices.empty()) {
            cnrn_target_delete(ttd.src_indices.data(), ttd.src_indices.size());
            cnrn_target_delete(ttd.src_gather.data(), ttd.src_gather.size());
        }

        if (!ttd.insrc_indices.empty()) {
            cnrn_target_delete(ttd.insrc_indices.data(), ttd.insrc_indices.size());
        }
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/network/partrans.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/sim/multicore.hpp"

// sgid_t is the integer type of the source/target ids ("sids") stored in
// SetupTransferInfo. It is int by default and int64_t when the build
// defines NRNLONGSGID to a non-zero value.
#ifndef NRNLONGSGID
#define NRNLONGSGID 0
#endif

#if NRNLONGSGID
using sgid_t = int64_t;
#else
using sgid_t = int;
#endif

namespace coreneuron {
struct Memb_list;

// NOTE(review): presumably true when src->target transfers (e.g. gap
// junctions) are present — confirm where it is set.
extern bool nrn_have_gaps;
// Gather source values of all threads and exchange them into insrc_buf_.
extern void nrnmpi_v_transfer();
// Copy insrc_buf_ values into the targets of the given thread.
extern void nrnthread_v_transfer(NrnThread*);

namespace nrn_partrans {

/** The basic problem is to copy sources to targets.
 *  It may be the case that a source gets copied to several targets.
 *  Sources and targets are a set of indices in NrnThread.data.
 *  A copy may be intrathread, interthread, interprocess.
 *  Copies happen every time step so efficiency is desirable.
 *  SetupTransferInfo gives us the source and target (sid, type, index) triples
 *  for a thread and all the global threads define what gets copied where.
 *  Need to process that info into TransferThreadData for each thread and
 *  the interprocessor mpi buffers insrc_buf_ and outsrc_buf_ transferred with
 *  MPI_Alltoallv, hopefully with a more or less optimal ordering.
 *  The compute strategy is: 1) Each thread copies its NrnThread.data source
 *  items to outsrc_buf_. 2) MPI_Alltoallv transfers outsrc_buf_ to insrc_buf_.
 *  3) Each thread, copies insrc_buf_ values to Nrnthread.data target.
 *
 *  Optimal ordering is probably beyond our reach but a few considerations
 *  may be useful. The typical use is for gap junctions where only voltage
 *  transferred and all instances of the HalfGap Point_process receive a
 *  voltage. Two situations are common. Voltage transfer is sparse and one
 *  to one, i.e many compartments do not have gap junctions, and those that do
 *  have only one. The other situation is that all compartments have gap
 *  junctions (e.g. syncytium of single compartment cells in the heart) and
 *  the voltage needs to be transferred to all neighboring cells (e.g. 6-18
 *  cells can be neighbors to the central cell). So on the target side, it
 *  might be good to copy to the target in target index order from the
 *  insrc_buf_. And on the source side, it is certainly simple to scatter
 *  to the outsrc_buf_ in NrnThread.data order.  Note that one expects a wide
 *  scatter to the outsrc_buf and also a wide scatter within the insrc_buf_.
 **/

/*
 * In partrans.cpp: nrnmpi_v_transfer
 *   Copy NrnThread.data to outsrc_buf_ for all threads via
 *     gpu: gather src_gather[i] = NrnThread._data[src_indices[i]];
 *     gpu to host src_gather
 *     cpu: outsrc_buf_[outsrc_indices[i]] = src_gather[gather2outsrc_indices[i]];
 *
 *   MPI_Alltoallv outsrc_buf_ to insrc_buf_
 *
 *   host to gpu insrc_buf_
 *
 * In partrans.cpp: nrnthread_v_transfer
 *   insrc_buf_ to NrnThread._data via
 *   NrnThread.data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
 *     where tar_indices depends on layout, type, etc.
 */

// Per-thread index tables driving the transfer.
// Send side (nrnmpi_v_transfer):
//   src_gather[i] = NrnThread._data[src_indices[i]]
//   outsrc_buf_[outsrc_indices[j]] = src_gather[gather2outsrc_indices[j]]
// Receive side (nrnthread_v_transfer):
//   NrnThread._data[tar_indices[k]] = insrc_buf_[insrc_indices[k]]
struct TransferThreadData {
    std::vector<int> src_indices;            // indices into NrnThread._data
    std::vector<double> src_gather;          // copy of NrnThread._data[src_indices]
    std::vector<int> gather2outsrc_indices;  // ix of src_gather that send into outsrc_indices
    std::vector<int> outsrc_indices;         // ix of outsrc_buf that receive src_gather values

    // The next two vectors are parallel: entry k describes one target.
    std::vector<int> insrc_indices;  // insrc_buf_ indices copied to ...
    std::vector<int> tar_indices;    // indices of NrnThread.data.
};
extern TransferThreadData* transfer_thread_data_; /* array for threads */

}  // namespace nrn_partrans
}  // namespace coreneuron

// For direct transfer,
// must be same as corresponding struct SetupTransferInfo in NEURON
//
// Per-thread description of transfer sources and targets. Entry i of the
// src_* (resp. tar_*) vectors describes one source (resp. target).
struct SetupTransferInfo {
    std::vector<sgid_t> src_sid;  // id of each source
    // (type, index) pair resolved with stdindex2ptr by
    // gap_data_indices_setup, which then overwrites src_index with the
    // resulting index into NrnThread._data.
    std::vector<int> src_type;
    std::vector<int> src_index;
    std::vector<sgid_t> tar_sid;  // id of the source each target receives
    // Same convention as src_type/src_index, for the targets.
    std::vector<int> tar_type;
    std::vector<int> tar_index;
};

namespace coreneuron {
namespace nrn_partrans {

extern SetupTransferInfo* setup_info_; /* array for threads exists only during setup*/

// Build the send/receive counts, displacements, buffers and the per-thread
// outsrc/insrc index tables from setup_info_[0..ngroup).
extern void gap_mpi_setup(int ngroup);
// Turn the (type, index) pairs of nt's setup info into NrnThread._data
// indices and store them in nt's TransferThreadData.
extern void gap_data_indices_setup(NrnThread* nt);
// Copy / remove the transfer index tables and insrc_buf_ to / from the
// device for threads that compute on the gpu.
extern void copy_gap_indices_to_device();
extern void delete_gap_indices_from_device();
// Free transfer_thread_data_ and the buffers, counts and displacements.
extern void gap_cleanup();

extern double* insrc_buf_;   // Receive buffer for gap voltages
extern double* outsrc_buf_;  // Send buffer for gap voltages
// Receive (insrc) and send (outsrc) counts and displacements for the
// alltoallv exchange.
extern int *insrccnt_, *insrcdspl_, *outsrccnt_, *outsrcdspl_;
}  // namespace nrn_partrans
}  // namespace coreneuron


================================================
FILE: coreneuron/network/partrans_setup.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <map>
#include <vector>

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/network/partrans.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"

namespace coreneuron {
using namespace coreneuron::nrn_partrans;

SetupTransferInfo* nrn_partrans::setup_info_;

// Where a given sid is used: parallel lists of (thread id, index into that
// thread's SetupTransferInfo vectors).
struct SidInfo {
    std::vector<int> tids_;
    std::vector<int> indices_;
};

}  // namespace coreneuron
// Select the alltoallv wrapper that matches the width of sgid_t.
#if NRNLONGSGID
#define sgid_alltoallv nrnmpi_long_alltoallv
#else
#define sgid_alltoallv nrnmpi_int_alltoallv
#endif

// These macros are defined immediately before including have2want.h.
// NOTE(review): the header presumably uses them to specialise have_to_want()
// for sgid_t keys — confirm in have2want.h.
#define HAVEWANT_t         sgid_t
#define HAVEWANT_alltoallv sgid_alltoallv
#define HAVEWANT2Int       std::map<sgid_t, int>
#include "coreneuron/network/have2want.h"

namespace coreneuron {
using namespace coreneuron::nrn_partrans;

void nrn_partrans::gap_mpi_setup(int ngroup) {
    // printf("%d gap_mpi_setup ngroup=%d\n", nrnmpi_myid, ngroup);

    // count total_nsrc, total_ntar and allocate.
    // Possible either or both are 0 on this process.
    size_t total_nsrc = 0, total_ntar = 0;
    for (int tid = 0; tid < ngroup; ++tid) {
        auto& si = setup_info_[tid];
        total_nsrc += si.src_sid.size();
        total_ntar += si.tar_sid.size();
    }

    // have and want arrays (add 1 to guarantee new ... is an array.)
    sgid_t* have = new sgid_t[total_nsrc + 1];
    sgid_t* want = new sgid_t[total_ntar + 1];

    // map from source sid to (tid, index), ie.  NrnThread[tid]._data[index].
    // and target sid to lists of (tid, index) for memb_list
    // also count the map sizes and fill have and want arrays
    std::map<sgid_t, SidInfo> src2info;
    std::map<sgid_t, SidInfo> tar2info;

    int src2info_size = 0, tar2info_size = 0;  // number of unique sids
    for (int tid = 0; tid < ngroup; ++tid) {
        auto& si = setup_info_[tid];
        // Sgid has unique source.

        for (size_t i = 0; i < si.src_sid.size(); ++i) {
            sgid_t sid = si.src_sid[i];
            SidInfo sidinfo;
            sidinfo.tids_.push_back(tid);
            sidinfo.indices_.push_back(i);
            src2info[sid] = sidinfo;
            have[src2info_size] = sid;
            src2info_size++;
        }
        // Possibly many targets of same sid
        // Only want unique sids. From each, can obtain all its targets.
        for (size_t i = 0; i < si.tar_sid.size(); ++i) {
            sgid_t sid = si.tar_sid[i];
            if (tar2info.find(sid) == tar2info.end()) {
                tar2info[sid] = SidInfo();
                want[tar2info_size] = sid;
                tar2info_size++;
            }
            SidInfo& sidinfo = tar2info[sid];
            sidinfo.tids_.push_back(tid);
            sidinfo.indices_.push_back(i);
        }
    }

    // 2) Call the have_to_want function.
    sgid_t* send_to_want;
    sgid_t* recv_from_have;

    have_to_want(have,
                 src2info_size,
                 want,
                 tar2info_size,
                 send_to_want,
                 outsrccnt_,
                 outsrcdspl_,
                 recv_from_have,
                 insrccnt_,
                 insrcdspl_,
                 default_rendezvous);

    int nhost = nrnmpi_numprocs;

    // sanity check. all the sgids we are asked to send, we actually have
    for (int i = 0; i < outsrcdspl_[nhost]; ++i) {
        sgid_t sgid = send_to_want[i];
        assert(src2info.find(sgid) != src2info.end());
    }

    // sanity check. all the sgids we receive, we actually need.
    for (int i = 0; i < insrcdspl_[nhost]; ++i) {
        sgid_t sgid = recv_from_have[i];
        assert(tar2info.find(sgid) != tar2info.end());
    }

#if CORENRN_DEBUG
    printf("%d mpi outsrccnt_, outsrcdspl_, insrccnt, insrcdspl_\n", nrnmpi_myid);
    for (int i = 0; i < nrnmpi_numprocs; ++i) {
        printf("%d : %d %d %d %d\n",
               nrnmpi_myid,
               outsrccnt_[i],
               outsrcdspl_[i],
               insrccnt_[i],
               insrcdspl_[i]);
    }
#endif

    // clean up a little
    delete[] have;
    delete[] want;

    insrc_buf_ = new double[insrcdspl_[nhost]];
    outsrc_buf_ = new double[outsrcdspl_[nhost]];

    // for i: src_gather[i] = NrnThread._data[src_indices[i]]
    // for j: outsrc_buf[outsrc_indices[j]] = src_gather[gather2outsrc_indices[j]]
    // src_indices point into NrnThread._data
    // Many outsrc_indices elements can point to the same src_gather element
    // but only if an sgid src datum is destined for multiple ranks.
    for (int i = 0; i < outsrcdspl_[nhost]; ++i) {
        sgid_t sgid = send_to_want[i];
        SidInfo& sidinfo = src2info[sgid];
        // only one item in the lists.
        int tid = sidinfo.tids_[0];
        int setup_info_index = sidinfo.indices_[0];

        auto& si = setup_info_[tid];
        auto& ttd = transfer_thread_data_[tid];

        // Note that src_index points into NrnThread.data, as it has already
        // been transformed using original src_type and src_index via
        // stdindex2ptr.
        // For copying into outsrc_buf from src_gather. This is from
        // NrnThread._data, fixup to "from src_gather" below.
        ttd.gather2outsrc_indices.push_back(si.src_index[setup_info_index]);
        ttd.outsrc_indices.push_back(i);
    }

    // Need to know src_gather index given NrnThread._data index
    // to compute gather2outsrc_indices. And the update outsrc_indices so that
    // for a given thread
    // for j: outsrc_buf[outsrc_indices[j]] = src_gather[gather2outsrc_indices[j]]
    for (int tid = 0; tid < ngroup; ++tid) {
        auto& ttd = transfer_thread_data_[tid];
        std::map<int, int> data2gather_indices;
        for (size_t i = 0; i < ttd.src_indices.size(); ++i) {
            data2gather_indices[ttd.src_indices[i]] = i;
        }

        for (size_t i = 0; i < ttd.outsrc_indices.size(); ++i) {
            ttd.gather2outsrc_indices[i] = data2gather_indices[ttd.gather2outsrc_indices[i]];
        }
    }

    // Which insrc_indices point into which NrnThread.data
    // An sgid occurs at most once in the process recv_from_have.
    // But it might get distributed to more than one thread and to
    // several targets in a thread (specified by tar2info)
    // insrc_indices is parallel to tar_indices and has size ntar of the thread.
    // insrc_indices[i] is the index into insrc_buf
    // tar_indices[i] is the index into NrnThread.data
    // i.e. NrnThead._data[tar_indices[i]] = insrc_buf[insrc_indices[i]]
    for (int i = 0; i < insrcdspl_[nhost]; ++i) {
        sgid_t sgid = recv_from_have[i];
        SidInfo& sidinfo = tar2info[sgid];
        // there may be several items in the lists.
        for (size_t j = 0; j < sidinfo.tids_.size(); ++j) {
            int tid = sidinfo.tids_[j];
            int index = sidinfo.indices_[j];

            transfer_thread_data_[tid].insrc_indices[index] = i;
        }
    }

#if CORENRN_DEBUG
    // things look ok so far?
    for (int tid = 0; tid < ngroup; ++tid) {
        SetupTransferInfo& si = setup_info_[tid];
        nrn_partrans::TransferThreadData& ttd = transfer_thread_data_[tid];
        for (size_t i = 0; i < si.src_sid.size(); ++i) {
            printf("%d %d src sid=%d v_index=%d %g\n",
                   nrnmpi_myid,
                   tid,
                   si.src_sid[i],
                   ttd.src_indices[i],
                   nrn_threads[tid]._data[ttd.src_indices[i]]);
        }
        for (size_t i = 0; i < ttd.tar_indices.size(); ++i) {
            printf("%d %d src sid=i%zd tar_index=%d %g\n",
                   nrnmpi_myid,
                   tid,
                   i,
                   ttd.tar_indices[i],
                   nrn_threads[tid]._data[ttd.tar_indices[i]]);
        }
    }
#endif

    delete[] send_to_want;
    delete[] recv_from_have;
}

/**
 *  For now, until conceptualization of the ordering is clear,
 *  just replace src setup_info_ indices values with stdindex2ptr determined
 *  index into NrnThread._data
 **/
void nrn_partrans::gap_data_indices_setup(NrnThread* n) {
    NrnThread& nt = *n;
    auto& xfer = transfer_thread_data_[nt.id];
    auto& info = setup_info_[nt.id];

    const size_t n_src = info.src_sid.size();
    const size_t n_tar = info.tar_sid.size();

    // One slot per source sid and one per target sid for this thread.
    xfer.src_gather.resize(n_src);
    xfer.src_indices.resize(n_src);
    xfer.insrc_indices.resize(n_tar);
    xfer.tar_indices.resize(n_tar);

    // Sources (NrnThread._data -> src_gather): replace every (type, index)
    // pair by the offset of the pointer returned by stdindex2ptr relative
    // to nt._data.
    for (size_t k = 0; k < n_src; ++k) {
        double* const slot = stdindex2ptr(info.src_type[k], info.src_index[k], nt);
        info.src_index[k] = static_cast<int>(slot - nt._data);
    }

    // Targets (insrc_buf -> NrnThread._data): same conversion.
    // todo : this should be revisited once nt._data will be broken
    // into mechanism specific data
    for (size_t k = 0; k < n_tar; ++k) {
        double* const slot = stdindex2ptr(info.tar_type[k], info.tar_index[k], nt);
        info.tar_index[k] = static_cast<int>(slot - nt._data);
    }

    // Here the info.src_... entries could be reordered according to
    // NrnThread._data index order.

    // Publish the converted offsets in the per-thread transfer data.
    xfer.src_indices = info.src_index;
    xfer.tar_indices = info.tar_index;
}

void nrn_partrans::gap_cleanup() {
    // Release the per-thread transfer data. delete[] on a null pointer is a
    // no-op, so no explicit guard is required.
    delete[] transfer_thread_data_;
    transfer_thread_data_ = nullptr;

    // The communication buffers are released as a group, keyed on insrc_buf_.
    if (!insrc_buf_) {
        return;
    }

    delete[] insrc_buf_;
    delete[] insrccnt_;
    delete[] insrcdspl_;
    delete[] outsrc_buf_;
    delete[] outsrccnt_;
    delete[] outsrcdspl_;

    insrc_buf_ = nullptr;
    insrccnt_ = nullptr;
    insrcdspl_ = nullptr;
    outsrc_buf_ = nullptr;
    outsrccnt_ = nullptr;
    outsrcdspl_ = nullptr;
}

}  // namespace coreneuron


================================================
FILE: coreneuron/network/tnode.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <vector>

// experiment with ordering strategies for Tree Nodes
namespace coreneuron {
class TNode;

using VecTNode = std::vector<TNode*>;

/**
 * \class TNode
 * \brief TNode is the tree node that represents the tree of the compartments
 */
class TNode {
  public:
    /// NOTE(review): presumably `ix` initializes nodeindex — confirm in the implementation file.
    TNode(int ix);
    virtual ~TNode();
    TNode* parent;      /// Parent node in the tree (non-owning raw pointer)
    VecTNode children;  /// Child nodes (vector of raw TNode pointers)
    size_t mkhash();  /// Hash algorithm that generates a hash based on the hash of the children and
                      /// the number of compartments of the children
    size_t hash;      /// Hash value generated by mkhash
    size_t treesize;  /// Total number of compartments from the current node and below
    size_t nodevec_index;   /// Index in nodevec that is set in check()
                            /// In cell permute 2 this is set as Breadth First traversal
    size_t treenode_order;  /// For cell permute 1 (Interleaved):
                            /// - This is the id given to the compartments based on a Breadth First
                            /// access on the tree that is created in the original circuit
                            /// - This is what makes the cell ordering interleaved
                            /// For cell permute 2 (Constant Depth):
                            /// VVVTN: Vector (groups of cells) of vector (levels of this group of
                            /// cells. Maxsize = maxlevel) of vector of TNodes. This changes 3 times
                            /// during cell permute 2:
                            /// 1. According to the sorting of the nodes of each level
                            /// 2. According to the sorting of the parents' treenode_order of the
                            /// previous ordering
                            /// 3. According to children and parents data races. Parents and
                            /// children of the tree are moved by question2() so that threads that
                            /// exist on the same warp don't have data races when updating the
                            /// children and parent variables, which would make threads wait in
                            /// atomic instructions. If any races remain they are resolved by
                            /// atomic instructions.
    size_t level;           /// Level of this compartment in the tree
    size_t cellindex;       /// Cell ID that this compartment belongs to
    size_t groupindex;      /// Initialized index / groupsize
    int nodeindex;          /// NOTE(review): looks like the original node index (see ctor) — confirm
};

size_t level_from_leaf(VecTNode&);
size_t level_from_root(VecTNode&);

/**
 * \brief Implementation of the advanced interleaving strategy (interleave_permute_type == 2)
 *
 * The main steps are the following:
 * 1. warp_balance function creates balanced groups of cells.
 * 2. The compartments/tree nodes populate the groups vector (VVVTN) based on their groupindex and
 * their level (see level_from_root).
 * 3. The analyze() & question2() functions (operating per group) make sure that each cell is still
 * a tree (treenode_order) and that the dependent nodes belong to separate warps.
 */
void group_order2(VecTNode&, size_t groupsize, size_t ncell);
size_t dist2child(TNode* nd);

/**
 * \brief Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
 *
 * Competing objectives are to keep identical cells together and also balance warps.
 *
 * \param ncell number of cells
 * \param nodevec vector of compartments from all cells
 * \return number of warps
 */
size_t warp_balance(size_t ncell, VecTNode& nodevec);

#define warpsize 32
}  // namespace coreneuron


================================================
FILE: coreneuron/network/tqueue.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstdarg>

#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/network/tqueue.hpp"

namespace coreneuron {
// splay tree + bin queue limited to fixed step method
// for event-sets or priority queues
// this starts from the sptqueue.cpp file and adds a bin queue

/* Derived from David Brower's c translation of pascal code by
Douglas Jones.
*/
/* The original c code is included from this file but note that instead
of struct _spblk, we are really using TQItem
*/

// Start with 1000 empty bins, the read position at bin 0 and bin time 0.
// The initializer order follows the member declaration order in tqueue.hpp;
// `new T*[n]()` value-initializes, i.e. every bin starts out as a null pointer.
BinQ::BinQ()
    : tt_(0.)
    , nbin_(1000)
    , qpt_(0)
    , bins_(new TQItem*[nbin_]()) {}

// The bin array only holds non-owning list heads: every bin is expected to
// have been drained before destruction (checked in debug builds).
BinQ::~BinQ() {
    int b = 0;
    while (b < nbin_) {
        assert(!bins_[b]);
        ++b;
    }
    vec_bins.clear();
    delete[] bins_;
}

void BinQ::resize(int size) {
    // printf("BinQ::resize from %d to %d\n", nbin_, size);
    assert(size >= nbin_);
    TQItem** bins = new TQItem*[size];
    for (int i = nbin_; i < size; ++i) {
        bins[i] = 0;
    }
    for (int i = 0, j = qpt_; i < nbin_; ++i, ++j) {
        if (j >= nbin_) {
            j = 0;
        }
        bins[i] = bins_[j];
        for (auto q = bins[i]; q; q = q->left_) {
            q->cnt_ = i;
        }
    }
    delete[] bins_;
    bins_ = bins;
    nbin_ = size;
    qpt_ = 0;
}
// Push q onto the front of the bin that corresponds to time td.
// The bin offset relative to the current bin is (td - tt_) * rev_dt, truncated
// after adding a small epsilon; the array grows when the offset does not fit.
void BinQ::enqueue(double td, TQItem* q) {
    int slot = static_cast<int>((td - tt_) * rev_dt + 1.e-10);
    assert(slot >= 0);
    if (slot >= nbin_) {
        resize(slot + 1000);
    }

    // Translate the relative offset into a position in the circular array.
    slot += qpt_;
    if (slot >= nbin_) {
        slot -= nbin_;
    }
    assert(slot < nbin_);

    q->cnt_ = slot;  // remembered for iteration/removal only
    q->left_ = bins_[slot];
    bins_[slot] = q;
}
// Pop and return the first item of the current bin (bins_[qpt_]);
// returns a null pointer when the current bin is empty.
TQItem* BinQ::dequeue() {
    TQItem* const head = bins_[qpt_];
    if (head != nullptr) {
        bins_[qpt_] = head->left_;
    }
    return head;
}

// Iteration start: head of the lowest-numbered non-empty bin (array order,
// not time order), or a null pointer when all bins are empty.
TQItem* BinQ::first() {
    int b = 0;
    while (b < nbin_ && !bins_[b]) {
        ++b;
    }
    return b < nbin_ ? bins_[b] : nullptr;
}
// Iteration step: the item following q in its own bin if there is one,
// otherwise the head of the next non-empty bin after q->cnt_; a null pointer
// marks the end of the iteration.
TQItem* BinQ::next(TQItem* q) {
    if (TQItem* const sibling = q->left_) {
        return sibling;
    }
    for (int b = q->cnt_ + 1; b < nbin_; ++b) {
        if (TQItem* const head = bins_[b]) {
            return head;
        }
    }
    return nullptr;
}

// Unlink q from the bin recorded in q->cnt_. The item itself is not deleted.
//
// Removing an item that is not found in its bin is a no-op, as before. In
// addition this no longer dereferences a null list head when the bin is empty,
// and it ignores a null q or a q whose cnt_ is not a valid bin number (e.g. -1,
// the tag used for items that live in the splay tree).
void BinQ::remove(TQItem* q) {
    if (q == nullptr || q->cnt_ < 0 || q->cnt_ >= nbin_) {
        return;
    }
    // Walk the chain through the address of each link so that the head of
    // the bin and interior nodes are unlinked by the same code path.
    TQItem** link = &bins_[q->cnt_];
    while (*link != nullptr) {
        if (*link == q) {
            *link = q->left_;
            return;
        }
        link = &(*link)->left_;
    }
}

//#include "coreneuron/nrniv/sptree.h"

/*
 *  The following code implements the basic operations on
 *  an event-set or priority-queue implemented using splay trees:
 *
 *  Hines changed this to void spinit(SPTREE*) for use with TQueue.
 *  SPTREE *spinit( compare )	Make a new tree
 *  SPBLK *spenq( n, q )	Insert n in q after all equal keys.
 *  SPBLK *spdeq( np )		Return first key under *np, removing it.
 *  void splay( n, q )		n (already in q) becomes the root.
 *  int n = sphead( q )         n is the head item in q (not removed).
 *  spdelete( n, q )		n is removed from q.
 *
 *  In the above, n points to an SPBLK type, while q points to an
 *  SPTREE.
 *
 *  The implementation used here is based on the implementation
 *  which was used in the tests of splay trees reported in:
 *
 *    An Empirical Comparison of Priority-Queue and Event-Set Implementations,
 *	by Douglas W. Jones, Comm. ACM 29, 4 (Apr. 1986) 300-311.
 *
 *  The changes made include the addition of the enqprior
 *  operation and the addition of up-links to allow for the splay
 *  operation.  The basic splay tree algorithms were originally
 *  presented in:
 *
 *	Self Adjusting Binary Trees,
 *		by D. D. Sleator and R. E. Tarjan,
 *			Proc. ACM SIGACT Symposium on Theory
 *			of Computing (Boston, Apr 1983) 235-245.
 *
 *  The enq and enqprior routines use variations on the
 *  top-down splay operation, while the splay routine is bottom-up.
 *  All are coded for speed.
 *
 *  Written by:
 *    Douglas W. Jones
 *
 *  Translated to C by:
 *    David Brower, daveb@rtech.uucp
 *
 * Thu Oct  6 12:11:33 PDT 1988 (daveb) Fixed spdeq, which was broken
 *	handling one-node trees.  I botched the pascal translation of
 *	a VAR parameter.
 */

/*----------------
 *
 * spinit() -- initialize an empty splay tree
 *
 */
void spinit(SPTREE* q) {
    /* an empty tree has no root and has performed no comparisons yet */
    q->root = nullptr;
    q->enqcmps = 0;
}

/*----------------
 *
 *  spenq() -- insert item in a tree.
 *
 *  put n in q after all other nodes with the same key; when this is
 *  done, n will be the root of the splay tree representing q, all nodes
 *  in q with keys less than or equal to that of n will be in the
 *  left subtree, all with greater keys will be in the right subtree;
 *  the tree is split into these subtrees from the top down, with rotations
 *  performed along the way to shorten the left branch of the right subtree
 *  and the right branch of the left subtree
 */
SPBLK* spenq(SPBLK* n, SPTREE* q) {
    /* Note: leftlink/rightlink/uplink/key are macros from tqueue.hpp that
       expand to the TQItem members left_/right_/parent_/t_, and STRCMP(a, b)
       is (a - b), so "STRCMP(x, key) > 0" means x is strictly later than key. */
    SPBLK* left;  /* the rightmost node in the left tree */
    SPBLK* right; /* the leftmost node in the right tree */
    SPBLK* next;  /* the root of the unsplit part */
    SPBLK* temp;

    double key;

    /* n becomes the new root unconditionally; the old tree (next) is then
       split around n->key */
    n->uplink = nullptr;
    next = q->root;
    q->root = n;
    if (next == nullptr) /* trivial enq */
    {
        n->leftlink = nullptr;
        n->rightlink = nullptr;
    } else /* difficult enq */
    {
        key = n->key;
        left = n;
        right = n;

        /* n's left and right children will hold the right and left
       splayed trees resulting from splitting on n->key;
       note that the children will be reversed! */

        /* enqcmps counts key comparisons (statistics only) */
        q->enqcmps++;
        if (STRCMP(next->key, key) > 0)
            goto two;

    one: /* assert next->key <= key */

        do /* walk to the right in the left tree */
        {
            temp = next->rightlink;
            if (temp == nullptr) {
                left->rightlink = next;
                next->uplink = left;
                right->leftlink = nullptr;
                goto done; /* job done, entire tree split */
            }

            q->enqcmps++;
            if (STRCMP(temp->key, key) > 0) {
                left->rightlink = next;
                next->uplink = left;
                left = next;
                next = temp;
                goto two; /* change sides */
            }

            /* temp->key <= key as well: rotate temp above next before
               attaching it, which shortens the right branch of the left tree */
            next->rightlink = temp->leftlink;
            if (temp->leftlink != nullptr)
                temp->leftlink->uplink = next;
            left->rightlink = temp;
            temp->uplink = left;
            temp->leftlink = next;
            next->uplink = temp;
            left = temp;
            next = temp->rightlink;
            if (next == nullptr) {
                right->leftlink = nullptr;
                goto done; /* job done, entire tree split */
            }

            q->enqcmps++;

        } while (STRCMP(next->key, key) <= 0); /* change sides */

    two: /* assert next->key > key */

        do /* walk to the left in the right tree */
        {
            temp = next->leftlink;
            if (temp == nullptr) {
                right->leftlink = next;
                next->uplink = right;
                left->rightlink = nullptr;
                goto done; /* job done, entire tree split */
            }

            q->enqcmps++;
            if (STRCMP(temp->key, key) <= 0) {
                right->leftlink = next;
                next->uplink = right;
                right = next;
                next = temp;
                goto one; /* change sides */
            }
            /* temp->key > key as well: mirror image of the rotation in "one" */
            next->leftlink = temp->rightlink;
            if (temp->rightlink != nullptr)
                temp->rightlink->uplink = next;
            right->leftlink = temp;
            temp->uplink = right;
            temp->rightlink = next;
            next->uplink = temp;
            right = temp;
            next = temp->leftlink;
            if (next == nullptr) {
                left->rightlink = nullptr;
                goto done; /* job done, entire tree split */
            }

            q->enqcmps++;

        } while (STRCMP(next->key, key) > 0); /* change sides */

        goto one;

    done: /* split is done, branches of n need reversal */

        /* during the split the left tree was collected under n->rightlink and
           the right tree under n->leftlink; swap them into place */
        temp = n->leftlink;
        n->leftlink = n->rightlink;
        n->rightlink = temp;
    }

    return (n);

} /* spenq */

/*----------------
 *
 *  spdeq() -- return and remove head node from a subtree.
 *
 *  remove and return the head node from the node set; this deletes
 *  (and returns) the leftmost node from q, replacing it with its right
 *  subtree (if there is one); on the way to the leftmost node, rotations
 *  are performed to shorten the left branch of the tree
 */
SPBLK* spdeq(SPBLK** np) /* pointer to a node pointer */

{
    /* Returns nullptr when np or *np is null. The link fields of the returned
       node itself are left untouched; callers reset them as needed. */
    SPBLK* deq;        /* one to return */
    SPBLK* next;       /* the next thing to deal with */
    SPBLK* left;       /* the left child of next */
    SPBLK* farleft;    /* the left child of left */
    SPBLK* farfarleft; /* the left child of farleft */

    if (np == nullptr || *np == nullptr) {
        deq = nullptr;
    } else {
        next = *np;
        left = next->leftlink;
        if (left == nullptr) {
            /* the subtree root is itself the leftmost node: replace it in
               *np by its right child */
            deq = next;
            *np = next->rightlink;

            /* the replacement is given a null uplink even when np is not the
               tree root slot; spdelete re-links it afterwards */
            if (*np != nullptr)
                (*np)->uplink = nullptr;

        } else
            for (;;) /* left is not null */
            {
                /* next is not it, left is not nullptr, might be it */
                farleft = left->leftlink;
                if (farleft == nullptr) {
                    deq = left;
                    next->leftlink = left->rightlink;
                    if (left->rightlink != nullptr)
                        left->rightlink->uplink = next;
                    break;
                }

                /* next, left are not it, farleft is not nullptr, might be it */
                farfarleft = farleft->leftlink;
                if (farfarleft == nullptr) {
                    deq = farleft;
                    left->leftlink = farleft->rightlink;
                    if (farleft->rightlink != nullptr)
                        farleft->rightlink->uplink = left;
                    break;
                }

                /* next, left, farleft are not it, rotate */
                /* farleft is lifted above left (left becomes its right
                   child), then the walk continues two levels further down */
                next->leftlink = farleft;
                farleft->uplink = next;
                left->leftlink = farleft->rightlink;
                if (farleft->rightlink != nullptr)
                    farleft->rightlink->uplink = left;
                farleft->rightlink = left;
                left->uplink = farleft;
                next = farleft;
                left = farfarleft;
            }
    }

    return (deq);

} /* spdeq */

/*----------------
 *
 *  splay() -- reorganize the tree.
 *
 *  the tree is reorganized so that n is the root of the
 *  splay tree representing q; results are unpredictable if n is not
 *  in q to start with; q is split from n up to the old root, with all
 *  nodes to the left of n ending up in the left subtree, and all nodes
 *  to the right of n ending up in the right subtree; the left branch of
 *  the right subtree and the right branch of the left subtree are
 *  shortened in the process
 *
 *  this code assumes that n is not nullptr and is in q; it can sometimes
 *  detect n not in q and complain
 */

void splay(SPBLK* n, SPTREE* q) {
    /* Bottom-up splay: starting at n, climb the uplink chain to the old root
       while accumulating everything left of n under `left` and everything
       right of n under `right`; finally n is installed as q->root with those
       two subtrees as its children. */
    SPBLK* up;     /* points to the node being dealt with */
    SPBLK* prev;   /* a descendent of up, already dealt with */
    SPBLK* upup;   /* the parent of up */
    SPBLK* upupup; /* the grandparent of up */
    SPBLK* left;   /* the top of left subtree being built */
    SPBLK* right;  /* the top of right subtree being built */

    left = n->leftlink;
    right = n->rightlink;
    prev = n;
    up = prev->uplink;

    while (up != nullptr) {
        /* walk up the tree towards the root, splaying all to the left of
       n into the left subtree, all to right into the right subtree */

        upup = up->uplink;
        if (up->leftlink == prev) /* up is to the right of n */
        {
            /* same-side parent and grandparent: rotate up above upup first */
            if (upup != nullptr && upup->leftlink == up) /* rotate */
            {
                upupup = upup->uplink;
                upup->leftlink = up->rightlink;
                if (upup->leftlink != nullptr)
                    upup->leftlink->uplink = upup;
                up->rightlink = upup;
                upup->uplink = up;
                if (upupup == nullptr)
                    q->root = up;
                else if (upupup->leftlink == upup)
                    upupup->leftlink = up;
                else
                    upupup->rightlink = up;
                up->uplink = upupup;
                upup = upupup;
            }
            /* up (and its right subtree) joins the top of the right tree */
            up->leftlink = right;
            if (right != nullptr)
                right->uplink = up;
            right = up;

        } else /* up is to the left of n */
        {
            /* mirror image of the case above */
            if (upup != nullptr && upup->rightlink == up) /* rotate */
            {
                upupup = upup->uplink;
                upup->rightlink = up->leftlink;
                if (upup->rightlink != nullptr)
                    upup->rightlink->uplink = upup;
                up->leftlink = upup;
                upup->uplink = up;
                if (upupup == nullptr)
                    q->root = up;
                else if (upupup->rightlink == upup)
                    upupup->rightlink = up;
                else
                    upupup->leftlink = up;
                up->uplink = upupup;
                upup = upupup;
            }
            /* up (and its left subtree) joins the top of the left tree */
            up->rightlink = left;
            if (left != nullptr)
                left->uplink = up;
            left = up;
        }
        prev = up;
        up = upup;
    }

#ifdef DEBUG
    /* the climb must have ended at the tree's root, otherwise n was not in q */
    if (q->root != prev) {
        /*	fprintf(stderr, " *** bug in splay: n not in q *** " ); */
        abort();
    }
#endif

    /* install n as the root with the two accumulated subtrees */
    n->leftlink = left;
    n->rightlink = right;
    if (left != nullptr)
        left->uplink = n;
    if (right != nullptr)
        right->uplink = n;
    q->root = n;
    n->uplink = nullptr;

} /* splay */

/*----------------
 *
 * sphead() --  return the "lowest" element in the tree.
 *
 *      returns a reference to the head event in the event-set q,
 *      represented as a splay tree; q->root ends up pointing to the head
 *      event, and the old left branch of q is shortened, as if q had
 *      been splayed about the head element; this is done by dequeueing
 *      the head and then making the resulting queue the right son of
 *      the head returned by spdeq; an alternative is provided which
 *      avoids splaying but just searches for and returns a pointer to
 *      the bottom of the left branch
 */
SPBLK* sphead(SPTREE* q) {
    /* splay version, good amortized bound: dequeue the head, then make what
       is left of the tree the right child of that head */
    SPBLK* const head = spdeq(&q->root);
    if (head == nullptr) {
        q->root = nullptr;
        return nullptr;
    }

    SPBLK* const rest = q->root;
    head->rightlink = rest;
    head->leftlink = nullptr;
    head->uplink = nullptr;
    if (rest != nullptr) {
        rest->uplink = head;
    }
    q->root = head;

    return head;

} /* sphead */

/*----------------
 *
 * spdelete() -- Delete node from a tree.
 *
 *	n is deleted from q; the resulting splay tree has been splayed
 *	around its new root, which is the successor of n
 *
 */
void spdelete(SPBLK* n, SPTREE* q) {
    /* bring n to the root so that only the root has to be replaced */
    splay(n, q);
    SPBLK* const doomed = q->root;

    /* the successor of the root is the head of its right subtree */
    SPBLK* const successor = spdeq(&doomed->rightlink);

    if (successor == nullptr) {
        /* empty right subtree: the left subtree becomes the whole tree */
        q->root = doomed->leftlink;
        if (q->root) {
            q->root->uplink = nullptr;
        }
        return;
    }

    /* non-empty right subtree: the successor takes over both subtrees */
    successor->uplink = nullptr;
    successor->leftlink = doomed->leftlink;
    successor->rightlink = doomed->rightlink;
    if (successor->leftlink != nullptr) {
        successor->leftlink->uplink = successor;
    }
    if (successor->rightlink != nullptr) {
        successor->rightlink->uplink = successor;
    }
    q->root = successor;

} /* spdelete */
}  // namespace coreneuron


================================================
FILE: coreneuron/network/tqueue.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

/*
**  SPTREE:  The following type declarations provide the binary tree
**  representation of event-sets or priority queues needed by splay trees
**
**  assumes that data and datb will be provided by the application
**  to hold all application specific information
**
**  assumes that key will be provided by the application, comparable
**  with the compare function applied to the addresses of two keys.
*/
// bin queue for the fixed step method for NetCons and PreSyns. Splay tree
// for others.
// fifo for the NetCons and PreSyns with same delay. Splay tree for
// others (especially SelfEvents).
// note that most methods below assume a TQItem is in the splay tree
// For the bin part, only insert_fifo, and remove make sense,
// The bin part assumes a fixed step method.

#include <cstdio>
#include <cassert>
#include <queue>
#include <vector>
#include <map>
#include <utility>

namespace coreneuron {
#define STRCMP(a, b) (a - b)

class TQItem;
#define SPBLK     TQItem
#define leftlink  left_
#define rightlink right_
#define uplink    parent_
#define cnt       cnt_
#define key       t_

/* Handle of one splay tree. SPBLK is a macro for TQItem, so the nodes of the
   tree are TQItem objects linked through left_/right_/parent_. */
struct SPTREE {
    SPBLK* root; /* root node; nullptr for an empty tree (see spinit) */

    /* Statistics, not strictly necessary, but handy for tuning  */
    int enqcmps; /* number of key comparisons performed in spenq */
};

#define spinit   sptq_spinit
#define spenq    sptq_spenq
#define spdeq    sptq_spdeq
#define splay    sptq_splay
#define sphead   sptq_sphead
#define spdelete sptq_spdelete

extern void spinit(SPTREE*);           /* init tree */
extern SPBLK* spenq(SPBLK*, SPTREE*);  /* insert item into the tree */
extern SPBLK* spdeq(SPBLK**);          /* return and remove lowest item in subtree */
extern void splay(SPBLK*, SPTREE*);    /* reorganize tree */
extern SPBLK* sphead(SPTREE*);         /* return first node in tree */
extern void spdelete(SPBLK*, SPTREE*); /* delete node from tree */

struct DiscreteEvent;
// Queue node shared by the splay tree (where it plays the role of SPBLK)
// and by the bin queue.
class TQItem {
  public:
    DiscreteEvent* data_ = nullptr;  // event carried by this item
    double t_ = 0;                   // time of the event; the splay tree key
    TQItem* left_ = nullptr;    // splay tree: left child; bin queue: next item in the same bin
    TQItem* right_ = nullptr;   // splay tree: right child
    TQItem* parent_ = nullptr;  // splay tree: parent (uplink)
    int cnt_ = 0;  // reused: -1 means it is in the splay tree, >=0 gives bin
};

using TQPair = std::pair<double, TQItem*>;

/// Ordering functor for TQPair: x is ordered before y when x has the later
/// time. Used as the comparator of the std::priority_queue in TQueue, it puts
/// the pair with the smallest time on top.
struct less_time {
    bool operator()(const TQPair& x, const TQPair& y) const {
        return y.first < x.first;
    }
};

// helper class for the TQueue (SplayTBinQueue).
/// Circular array of time bins used by TQueue for the fixed step method.
/// Each bin is the head of a list of TQItem chained through TQItem::left_.
class BinQ {
  public:
    BinQ();
    ~BinQ();

    /// Push an item onto the bin that corresponds to time tt.
    void enqueue(double tt, TQItem*);

    /// Advance to the next bin and record tt as the time at which it begins.
    /// The current bin must already be empty.
    void shift(double tt) {
        assert(!bins_[qpt_]);
        tt_ = tt;
        qpt_ += 1;
        if (qpt_ >= nbin_) {
            qpt_ = 0;
        }
    }

    /// First item of the current bin without removing it (may be null).
    TQItem* top() {
        TQItem* const head = bins_[qpt_];
        return head;
    }

    /// Remove and return the first item of the current bin (may be null).
    TQItem* dequeue();

    /// Time at the beginning of the current bin.
    double tbin() {
        const double bin_start = tt_;
        return bin_start;
    }

    // for iteration
    TQItem* first();
    TQItem* next(TQItem*);

    /// Unlink an item from its bin; the item is not deleted.
    void remove(TQItem*);

    /// Grow the bin array to the given number of bins.
    void resize(int);

  private:
    double tt_;  // time at beginning of qpt_ interval
    int nbin_;   // number of bins in bins_
    int qpt_;    // index of the current bin
    TQItem** bins_;
    std::vector<std::vector<TQItem*>> vec_bins;
};

enum container { spltree, pq_que };

/// Event queue made of a bin queue (BinQ) plus a time-ordered container
/// selected by C: a splay tree (spltree) or a std::priority_queue (pq_que).
/// The earliest item of the ordered part is cached in least_ and is not
/// stored in the container itself. The container-specific members are
/// specialized in tqueue.ipp.
template <container C = spltree>
class TQueue {
  public:
    TQueue();
    ~TQueue();

    /// Earliest item of the ordered part, or nullptr when there is none.
    inline TQItem* least() {
        return least_;
    }
    /// Allocate an item for (t, data) and add it to the ordered part.
    inline TQItem* insert(double t, DiscreteEvent* data);
    /// Allocate an item for (t, data) and add it to the bin queue.
    inline TQItem* enqueue_bin(double t, DiscreteEvent* data);
    /// Pop the first item of the current bin (may return nullptr).
    inline TQItem* dequeue_bin() {
        return binq_->dequeue();
    }
    /// Advance the bin queue to its next bin and count the shift in nshift_.
    inline void shift_bin(double _t_) {
        ++nshift_;
        binq_->shift(_t_);
    }
    /// First item of the current bin without removing it.
    inline TQItem* top() {
        return binq_->top();
    }

    /// Return the least item if its time is <= til (and promote the next one),
    /// otherwise nullptr.
    inline TQItem* atomic_dq(double til);
    /// Remove an item from the ordered part.
    inline void remove(TQItem*);
    /// Change the time of an item of the ordered part to tnew.
    inline void move(TQItem*, double tnew);
    int nshift_;  ///< number of shift_bin() calls so far

    /// Priority queue of (time, item) pairs for queuing the events. Enqueuing for move() and
    /// move_least_nolock() is not implemented
    std::priority_queue<TQPair, std::vector<TQPair>, less_time> pq_que_;
    /// Types of queuing statistics
    enum qtype { enq = 0, spike, ite, deq };

  private:
    /// Time of least_, or 1e15 as a "no item" sentinel.
    double least_t_nolock() {
        if (least_) {
            return least_->t_;
        } else {
            return 1e15;
        }
    }
    void move_least_nolock(double tnew);
    SPTREE* sptree_;  ///< splay tree, allocated in the constructor

  public:
    BinQ* binq_;  ///< bin queue, allocated in the constructor

  private:
    TQItem* least_;  ///< cached earliest item; kept outside sptree_ / pq_que_
    /// Build the (time, item) pair stored in pq_que_.
    TQPair make_TQPair(TQItem* p) {
        return TQPair(p->t_, p);
    }
};
}  // namespace coreneuron
#include "coreneuron/network/tqueue.ipp"


================================================
FILE: coreneuron/network/tqueue.ipp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#ifndef tqueue_ipp_
#define tqueue_ipp_

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstdarg>

#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/network/tqueue.hpp"

namespace coreneuron {
// splay tree + bin queue limited to fixed step method
// for event-sets or priority queues
// this starts from the sptqueue.cpp file and adds a bin queue

/* Derived from David Brower's c translation of pascal code by
Douglas Jones.
*/
/* The original c code is included from this file but note that instead
of struct _spblk, we are really using TQItem
*/

/// Create an empty queue: no shifts counted, an initialized empty splay tree,
/// a fresh bin queue and no least item. The initializer order follows the
/// member declaration order of TQueue.
template <container C>
TQueue<C>::TQueue()
    : nshift_(0)
    , sptree_(new SPTREE)
    , binq_(new BinQ)
    , least_(nullptr) {
    spinit(sptree_);
}

/// Free every item still held by the bin queue, the cached least item, the
/// splay tree and the priority queue, together with the containers themselves.
template <container C>
TQueue<C>::~TQueue() {
    /// Drain the bin queue; the successor is fetched before the item is unlinked
    SPBLK* item = binq_->first();
    while (item) {
        SPBLK* const following = binq_->next(item);
        binq_->remove(item);
        delete item;
        item = following;
    }
    delete binq_;

    /// The cached least item lives outside the containers (delete on nullptr is a no-op)
    delete least_;
    least_ = nullptr;

    /// Drain the splay tree
    for (SPBLK* node = spdeq(&sptree_->root); node != nullptr; node = spdeq(&sptree_->root)) {
        delete node;
    }
    delete sptree_;

    /// Drain the priority queue
    while (!pq_que_.empty()) {
        delete pq_que_.top().second;
        pq_que_.pop();
    }
}

/// Allocate a new item carrying event d at time td, put it into the bin queue
/// and return it.
template <container C>
TQItem* TQueue<C>::enqueue_bin(double td, DiscreteEvent* d) {
    TQItem* const item = new TQItem;
    item->t_ = td;
    item->data_ = d;
    binq_->enqueue(td, item);
    return item;
}

/// Splay tree priority queue implementation.
/// Give the cached least item the new time tnew; when the head of the splay
/// tree is now earlier, that head becomes the least item and the old one is
/// enqueued into the tree.
template <>
inline void TQueue<spltree>::move_least_nolock(double tnew) {
    TQItem* const current = least();
    if (!current) {
        return;
    }
    current->t_ = tnew;

    TQItem* const head = sphead(sptree_);
    if (head == nullptr || !(tnew > head->t_)) {
        return;  // still the earliest item
    }
    least_ = spdeq(&sptree_->root);
    spenq(current, sptree_);
}

/// STL priority queue implementation.
/// Give the cached least item the new time tnew; when the top of the priority
/// queue is now earlier, that top becomes the least item and the old one is
/// pushed into the queue.
///
/// Entries whose item has a negative time are tombstones left behind by
/// move()/remove(); they are discarded here (as atomic_dq does) so that a
/// tombstone is never promoted to least_. The queue is checked for emptiness
/// before top() is used, because calling top() on an empty
/// std::priority_queue is undefined behaviour.
template <>
inline void TQueue<pq_que>::move_least_nolock(double tnew) {
    TQItem* const current = least();
    if (!current) {
        return;
    }
    current->t_ = tnew;

    while (!pq_que_.empty() && pq_que_.top().second->t_ < 0.) {
        delete pq_que_.top().second;
        pq_que_.pop();
    }
    if (pq_que_.empty()) {
        return;  // nothing else queued: current stays the least item
    }

    TQItem* const head = pq_que_.top().second;
    if (head && (tnew > head->t_)) {
        least_ = head;
        pq_que_.pop();
        pq_que_.push(make_TQPair(current));
    }
}

/// Splay tree priority queue implementation.
/// Change the time of item i to tnew. The least item is handled by
/// move_least_nolock(); any other item is taken out of the tree, re-timed and
/// either becomes the new least item (the old one goes into the tree) or is
/// re-inserted into the tree.
template <>
inline void TQueue<spltree>::move(TQItem* i, double tnew) {
    if (i == least_) {
        move_least_nolock(tnew);
        return;
    }

    const bool becomes_least = tnew < least_->t_;
    spdelete(i, sptree_);
    i->t_ = tnew;
    if (becomes_least) {
        spenq(least_, sptree_);
        least_ = i;
    } else {
        spenq(i, sptree_);
    }
}

/// STL priority queue implementation.
/// Change the time of item i to tnew. The least item is handled by
/// move_least_nolock(). Any other item cannot be extracted from the priority
/// queue, so it is marked as a tombstone (t_ = -1) and a new item carrying the
/// same data_ and cnt_ is created with time tnew; that copy either becomes the
/// new least item (the old one is pushed into the queue) or is pushed itself.
template <>
inline void TQueue<pq_que>::move(TQItem* i, double tnew) {
    if (i == least_) {
        move_least_nolock(tnew);
        return;
    }

    const bool becomes_least = tnew < least_->t_;

    TQItem* const moved = new TQItem;
    moved->data_ = i->data_;
    moved->t_ = tnew;
    moved->cnt_ = i->cnt_;
    i->t_ = -1.;

    if (becomes_least) {
        pq_que_.push(make_TQPair(least_));
        least_ = moved;
    } else {
        pq_que_.push(make_TQPair(moved));
    }
}

/// Splay tree priority queue implementation.
/// Allocate an item for event d at time tt (tagged cnt_ = -1, i.e. "in the
/// splay tree") and return it. An item earlier than the current least item
/// takes its place and the previous least item is enqueued into the tree;
/// otherwise the new item goes into the tree.
template <>
inline TQItem* TQueue<spltree>::insert(double tt, DiscreteEvent* d) {
    TQItem* const item = new TQItem;
    item->data_ = d;
    item->t_ = tt;
    item->cnt_ = -1;

    if (!(tt < least_t_nolock())) {
        spenq(item, sptree_);
        return item;
    }

    /// Storing both the time and the event that has the time is probably redundant, but the
    /// item is returned to the caller. Before eliminating the t_ and cnt_ fields, make sure
    /// nothing breaks.
    if (least_) {
        spenq(least_, sptree_);
    }
    least_ = item;
    return item;
}

/// STL priority queue implementation.
/// Allocate an item for event d at time tt (tagged cnt_ = -1) and return it.
/// An item earlier than the current least item takes its place and the
/// previous least item is pushed into the priority queue; otherwise the new
/// item is pushed.
template <>
inline TQItem* TQueue<pq_que>::insert(double tt, DiscreteEvent* d) {
    TQItem* const item = new TQItem;
    item->data_ = d;
    item->t_ = tt;
    item->cnt_ = -1;

    if (!(tt < least_t_nolock())) {
        pq_que_.push(make_TQPair(item));
        return item;
    }

    /// Storing both the time and the event that has the time is probably redundant, but the
    /// item is returned to the caller. Before eliminating the t_ and cnt_ fields, make sure
    /// nothing breaks.
    if (least_) {
        pq_que_.push(make_TQPair(least_));
    }
    least_ = item;
    return item;
}

/// Splay tree priority queue implementation
///
/// Takes \a q out of the queue and deletes it. A null \a q is ignored.
/// If \a q is the cached head, the next item is dequeued from the tree to replace it
/// (or the head becomes null when the tree has no root); otherwise \a q is deleted from the tree.
template <>
inline void TQueue<spltree>::remove(TQItem* q) {
    if (!q) {
        return;
    }
    if (q != least_) {
        spdelete(q, sptree_);
    } else if (sptree_->root) {
        least_ = spdeq(&sptree_->root);
    } else {
        least_ = nullptr;
    }
    delete q;
}

/// STL priority queue implementation
///
/// Takes \a q out of the queue. A null \a q is ignored.
///
/// - If \a q is not the cached head it cannot be extracted from the heap, so it is only
///   flagged with a negative time; flagged items are deleted when they surface at the top
///   of the heap.
/// - If \a q is the cached head, the next live item of the heap is promoted to head and
///   \a q is deleted, matching the splay tree variant which also deletes the removed item.
///
/// Flagged items sitting on top of the heap are purged before the promotion (the same way
/// atomic_dq does it); without that a cancelled item could become the head and be handed
/// out again.
template <>
inline void TQueue<pq_que>::remove(TQItem* q) {
    if (!q) {
        return;
    }
    if (q != least_) {
        // lazy deletion: the item stays in the heap until it reaches the top
        q->t_ = -1.;
        return;
    }

    // drop items that were cancelled/moved and are only waiting to be popped
    while (pq_que_.size() && pq_que_.top().second->t_ < 0.) {
        delete pq_que_.top().second;
        pq_que_.pop();
    }
    if (pq_que_.size()) {
        least_ = pq_que_.top().second;
        pq_que_.pop();
    } else {
        least_ = nullptr;
    }
    // the old head is referenced neither by least_ nor by the heap any more
    delete q;
}

/// Splay tree priority queue implementation
///
/// Dequeues the head item if its time is not later than \a tt.
/// \return the former head, or nullptr when the queue has no head or the head is later
///         than \a tt. The returned item is not deleted here.
template <>
inline TQItem* TQueue<spltree>::atomic_dq(double tt) {
    if (!least_ || !(least_->t_ <= tt)) {
        return nullptr;
    }
    TQItem* due = least_;
    // refill the cached head from the tree, if the tree holds anything
    if (sptree_->root) {
        least_ = spdeq(&sptree_->root);
    } else {
        least_ = nullptr;
    }
    return due;
}

/// STL priority queue implementation
///
/// Dequeues the head item if its time is not later than \a tt.
/// \return the former head, or nullptr when the queue has no head or the head is later
///         than \a tt. The returned item is not deleted here.
template <>
inline TQItem* TQueue<pq_que>::atomic_dq(double tt) {
    if (!least_ || !(least_->t_ <= tt)) {
        return nullptr;
    }
    TQItem* due = least_;

    /// Items whose time was changed through ::move (or that were removed) are left in the heap
    /// with a negative time because the heap only supports pop. Delete the ones that surfaced
    /// at the top before choosing the next head.
    while (pq_que_.size() && pq_que_.top().second->t_ < 0.) {
        delete pq_que_.top().second;
        pq_que_.pop();
    }

    if (!pq_que_.size()) {
        least_ = nullptr;
    } else {
        least_ = pq_que_.top().second;
        pq_que_.pop();
    }
    return due;
}
}  // namespace coreneuron
#endif


================================================
FILE: coreneuron/nrnconf.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once

#include "coreneuron/config/version_macros.hpp"
#include "coreneuron/utils/offload.hpp"

#include <cstdio>
#include <cmath>
#include <cassert>
#include <cerrno>
#include <cstdint>

// Basic type aliases, node-array accessor macros and declarations of globals and allocation
// helpers. Everything declared `extern` here is defined elsewhere (not visible in this header).
namespace coreneuron {

#define NRNBBCORE 1

using Datum = int;
using Pfri = int (*)();
using Symbol = char;

// Accessors for element i of the per-thread node arrays. They expand to members of a pointer
// named `_nt`, so such a variable has to be in scope wherever they are used.
#define VEC_A(i)    (_nt->_actual_a[(i)])
#define VEC_B(i)    (_nt->_actual_b[(i)])
#define VEC_D(i)    (_nt->_actual_d[(i)])
#define VEC_RHS(i)  (_nt->_actual_rhs[(i)])
#define VEC_V(i)    (_nt->_actual_v[(i)])
#define VEC_AREA(i) (_nt->_actual_area[(i)])
#define VECTORIZE   1

extern double celsius;
extern double pi;
extern int secondorder;

extern double t, dt;
extern int rev_dt;
extern bool stoprun;
extern const char* bbcore_write_version;
// NOTE(review): stoprun is declared bool above. `tstopset` therefore stores true, and
// `tstopunset` computes true & ~tstopbit, which is still non-zero, so it does not reset a set
// flag. Confirm whether stoprun was meant to be an integer bit mask.
#define tstopbit   (1 << 15)
#define tstopset   stoprun |= tstopbit
#define tstopunset stoprun &= (~tstopbit)

// Allocation helpers and version check; see their definitions for the exact contracts.
extern void* nrn_cacheline_alloc(void** memptr, size_t size);
extern void* emalloc_align(size_t size, size_t alignment);
extern void* ecalloc_align(size_t n, size_t size, size_t alignment);
extern void check_bbcore_write_version(const char*);


}  // namespace coreneuron


================================================
FILE: coreneuron/nrniv/nrniv_decl.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <vector>
#include <map>
#include "coreneuron/network/netcon.hpp"
// Declarations only: every global and function below is defined in another translation unit.
namespace coreneuron {

/// Mechanism type to be used from stdindex2ptr and nrn_dblpntr2nrncore (in Neuron)
/// Values of the mechanism types should be negative numbers to avoid any conflict with
/// mechanism types of Memb_list(>0) or time(0) passed from Neuron
enum mech_type { voltage = -1, i_membrane_ = -2 };

extern bool cvode_active_;
/// Vector of maps for negative presyns
extern std::vector<std::map<int, PreSyn*>> neg_gid2out;
/// Maps for output and input presyns
extern std::map<int, PreSyn*> gid2out;
extern std::map<int, InputPreSyn*> gid2in;

/// InputPreSyn.nc_index_ to + InputPreSyn.nc_cnt_ give the NetCon*
extern std::vector<NetCon*> netcon_in_presyn_order_;
/// Only for setup vector of netcon source gids and mindelay determination
extern std::vector<int*> nrnthreads_netcon_srcgid;
/// Companion to nrnthreads_netcon_srcgid when src gid is negative to allow
/// determination of the NrnThread of the source PreSyn.
extern std::vector<std::vector<int>> nrnthreads_netcon_negsrcgid_tid;

// Setup, teardown and simulation entry points.
extern void mk_mech(const char* path);
extern void set_globals(const char* path, bool cli_global_seed, int cli_global_seed_value);
extern void mk_netcvode(void);
extern void nrn_p_construct(void);
extern double* stdindex2ptr(int mtype, int index, NrnThread&);
extern void delete_trajectory_requests(NrnThread&);
extern void nrn_cleanup();
extern void nrn_cleanup_ion_map();
extern void BBS_netpar_solve(double);
extern void nrn_mkPatternStim(const char* filename, double tstop);
extern int nrn_extra_thread0_vdata;
extern void nrn_set_extra_thread0_vdata(void);
extern Point_process* nrn_artcell_instantiate(const char* mechname);
extern int nrnmpi_spike_compress(int nspike, bool gidcompress, int xchng);
extern bool nrn_use_bin_queue_;

extern void nrn_outputevent(unsigned char, double);
extern void ncs2nrn_integrate(double tstop);

extern void handle_forward_skip(double forwardskip, int prcellgid);

extern int nrn_set_timeout(int);
extern void nrn_fake_fire(int gid, double spiketime, int fake_out);

extern void netpar_tid_gid2ps(int tid, int gid, PreSyn** ps, InputPreSyn** psi);
extern double set_mindelay(double maxdelay);

extern int nrn_soa_padded_size(int cnt, int layout);

// Node permutation settings (defined in the permute sources).
extern int interleave_permute_type;
extern int cellorder_nwarp;

// Memory layout selector.
// Mechanism pdata index values into _actual_v and _actual_area data need to be updated.
enum Layout { SoA = 0, AoS = 1 };
}  // namespace coreneuron


================================================
FILE: coreneuron/nrnoc/md1redef.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

// Rename a set of short, collision-prone identifiers to underscore-prefixed names for all code
// that follows this header. The header md2redef.h removes these macros again.
// NOTE(review): presumably the two headers are meant to bracket a region of code - confirm
// against the includers.
#define v        _v
#define area     _area
#define thisnode _thisnode
#define GC       _GC
#define EC       _EC
#define extnode  _extnode
#define xain     _xain
#define xbout    _xbout
#define i        _i
#define sec      _sec

// Drop any earlier definitions of the following names before (re)defining them below.
// Memb_list is only undefined here; it is not redefined.
#undef Memb_list
#undef nodelist
#undef nodeindices
#undef data
#undef pdata
#undef prop
#undef nodecount
#undef pval
#undef id
#undef weights
#undef weight_index_

// Second group of renames. Note that weight_index_ maps to _weight_index (no trailing
// underscore), unlike the others which only gain a leading underscore.
#define nodelist      _nodelist
#define nodeindices   _nodeindices
#define data          _data
#define pdata         _pdata
#define prop          _prop
#define nodecount     _nodecount
#define pval          _pval
#define id            _id
#define weights       _weights
#define weight_index_ _weight_index


================================================
FILE: coreneuron/nrnoc/md2redef.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

// Remove the identifier-renaming macros so that the plain names are usable again.
// First group: the short names (v, area, i, sec, ...).
#undef v
#undef area
#undef thisnode
#undef GC
#undef EC
#undef extnode
#undef xain
#undef xbout
#undef i
#undef sec

// Second group: structure and member names. NrnThread and Memb_list are undefined here as
// well, which is harmless if no such macro exists.
#undef NrnThread
#undef Memb_list
#undef nodelist
#undef nodeindices
#undef data
#undef pdata
#undef prop
#undef nodecount
#undef pval
#undef weights
#undef weight_index_

#undef id


================================================
FILE: coreneuron/permute/balance.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

// use LPT algorithm to balance cells so all warps have similar number
// of compartments.
// NB: Ideally we'd balance so that warps have similar ncycle. But we do not
// know how to predict warp quality without an a priori set of cells to
// fill the warp. For large numbers of cells in a warp,
// it is a justifiable speculation to presume that there will be very
// few holes in warp filling. I.e., ncycle = ncompart/warpsize

#include <algorithm>

#include "coreneuron/nrnconf.h"
#include "coreneuron/network/tnode.hpp"
#include "coreneuron/utils/lpt.hpp"

namespace coreneuron {
// Upper bound on the number of warps used by warp_balance; 0 means do not balance
// (warp_balance then returns 0 without touching the node order).
int cellorder_nwarp = 0;

// Strict weak ordering for sorting roots: primary key is the warp (groupindex),
// ties are broken by the previous position in nodevec (nodevec_index).
bool warpcmp(const TNode* a, const TNode* b) {
    if (a->groupindex != b->groupindex) {
        return a->groupindex < b->groupindex;
    }
    return a->nodevec_index < b->nodevec_index;
}

// Distribute the ncell roots at the front of nodevec over warps so that all
// warps hold a similar number of compartments, and reorder nodevec to match.
//
// Each root is weighted by its treesize and assigned to a warp by lpt(). The
// warp index is stored in groupindex of the root, the first ncell entries of
// nodevec are sorted by (warp, previous order), and afterwards every node of
// nodevec passes its groupindex on to its children and gets its nodevec_index
// refreshed.
//
// On entry, nodevec is expected to be ordered so that each cell type is
// together and largest cells first.
//
// Returns the number of warps used, i.e. min(ncell, cellorder_nwarp), or 0 if
// nothing was done because ncell == 0 or cellorder_nwarp == 0.
size_t warp_balance(size_t ncell, VecTNode& nodevec) {
    if (ncell == 0 || cellorder_nwarp == 0) {
        return 0;
    }

    // cannot be more warps than cells
    const size_t nwarp = std::min(ncell, size_t(cellorder_nwarp));

    // per-cell weights, and the number of distinct cell types (cells of one
    // type are adjacent, so a type starts wherever the hash changes)
    std::vector<size_t> cellsize(ncell);
    size_t ntype = 0;
    for (size_t i = 0; i < ncell; ++i) {
        cellsize[i] = nodevec[i]->treesize;
        if (i == 0 || nodevec[i]->hash != nodevec[i - 1]->hash) {
            ++ntype;
        }
    }

    // balance when order is unrestricted (identical cells not together)
    // i.e. pieces are cellsize
    double best_balance = 0.0;
    auto inwarp = lpt(nwarp, cellsize, &best_balance);
    // size_t arguments need %zu; %ld is only correct where long happens to
    // have the same representation
    printf("best_balance=%g ncell=%zu ntype=%zu nwarp=%zu\n", best_balance, ncell, ntype, nwarp);

    // order the roots for balance
    for (size_t i = 0; i < ncell; ++i) {
        nodevec[i]->groupindex = inwarp[i];
    }
    std::sort(nodevec.begin(), nodevec.begin() + ncell, warpcmp);

    // propagate the warp of each node to its children and record new positions
    for (size_t i = 0; i < nodevec.size(); ++i) {
        TNode* nd = nodevec[i];
        for (size_t j = 0; j < nd->children.size(); ++j) {
            nd->children[j]->groupindex = nd->groupindex;
        }
        nd->nodevec_index = i;
    }

    return nwarp;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/cellorder.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/network/tnode.hpp"
#include "coreneuron/utils/lpt.hpp"
#include "coreneuron/utils/memory.h"
#include "coreneuron/utils/offload.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

#include "coreneuron/permute/node_permute.h"  // for print_quality

#ifdef _OPENACC
#include <openacc.h>
#endif

#include <set>

namespace coreneuron {
// Selects the node permutation scheme; the code in this file distinguishes the values 1 and 2.
int interleave_permute_type;
// One InterleaveInfo per thread, allocated by create_interleave_info().
InterleaveInfo* interleave_info;  // nrn_nthread array


// Exchange the complete state (sizes, layout arrays and statistics arrays) with `info`.
// Only pointers and counters are swapped; no array contents are copied.
void InterleaveInfo::swap(InterleaveInfo& info) {
    // statistics arrays
    std::swap(child_race, info.child_race);
    std::swap(cache_access, info.cache_access);
    std::swap(idle, info.idle);
    std::swap(ncycle, info.ncycle);
    std::swap(nnode, info.nnode);

    // layout arrays
    std::swap(cellsize, info.cellsize);
    std::swap(lastnode, info.lastnode);
    std::swap(firstnode, info.firstnode);
    std::swap(stride, info.stride);
    std::swap(stridedispl, info.stridedispl);

    // sizes
    std::swap(nstride, info.nstride);
    std::swap(nwarp, info.nwarp);
}

// Deep copy of `info`.
//
// Any of the source arrays may legitimately be null: entries created by
// create_interleave_info() start out without arrays, and interleave_order() allocates the
// statistics arrays only for thread 0. A null source array is therefore copied as null instead
// of being read from, which also keeps the destructor's null checks meaningful for the copy.
//
// NOTE(review): the element counts below are the ones this constructor has always used. For
// interleave_permute_type == 1 the debug output and print_quality1 index firstnode, lastnode
// and cellsize per cell, so nwarp + 1 / nwarp elements may not cover those arrays - confirm
// against node_order.
InterleaveInfo::InterleaveInfo(const InterleaveInfo& info) {
    nwarp = info.nwarp;
    nstride = info.nstride;

    // start from the "nothing allocated" state
    stridedispl = nullptr;
    stride = nullptr;
    firstnode = nullptr;
    lastnode = nullptr;
    cellsize = nullptr;
    nnode = nullptr;
    ncycle = nullptr;
    idle = nullptr;
    cache_access = nullptr;
    child_race = nullptr;

    // layout arrays (aligned allocations)
    if (info.stridedispl) {
        copy_align_array(stridedispl, info.stridedispl, nwarp + 1);
    }
    if (info.stride) {
        copy_align_array(stride, info.stride, nstride);
    }
    if (info.firstnode) {
        copy_align_array(firstnode, info.firstnode, nwarp + 1);
    }
    if (info.lastnode) {
        copy_align_array(lastnode, info.lastnode, nwarp + 1);
    }
    if (info.cellsize) {
        copy_align_array(cellsize, info.cellsize, nwarp);
    }

    // statistics arrays (released with delete[] by the destructor)
    if (info.nnode) {
        copy_array(nnode, info.nnode, nwarp);
    }
    if (info.ncycle) {
        copy_array(ncycle, info.ncycle, nwarp);
    }
    if (info.idle) {
        copy_array(idle, info.idle, nwarp);
    }
    if (info.cache_access) {
        copy_array(cache_access, info.cache_access, nwarp);
    }
    if (info.child_race) {
        copy_array(child_race, info.child_race, nwarp);
    }
}

// Copy assignment via copy-and-swap: build a full copy of `info`, exchange state with it and
// let the temporary release what this object held before. Self assignment is a no-op.
InterleaveInfo& InterleaveInfo::operator=(const InterleaveInfo& info) {
    if (this != &info) {
        InterleaveInfo copy(info);
        swap(copy);
    }
    return *this;
}

// Release all owned arrays. The arrays are treated as groups: `idle` guards the statistics
// arrays and `stride` guards stride/firstnode/lastnode/cellsize; stridedispl is checked alone.
InterleaveInfo::~InterleaveInfo() {
    // statistics arrays
    if (idle != nullptr) {
        delete[] child_race;
        delete[] cache_access;
        delete[] idle;
        delete[] ncycle;
        delete[] nnode;
    }

    if (stridedispl != nullptr) {
        free_memory(stridedispl);
    }

    // layout arrays
    if (stride != nullptr) {
        free_memory(cellsize);
        free_memory(lastnode);
        free_memory(firstnode);
        free_memory(stride);
    }
}

// (Re)create the global interleave_info array with one default-constructed entry per thread.
void create_interleave_info() {
    // release the array of a previous call, if any
    destroy_interleave_info();
    interleave_info = new InterleaveInfo[nrn_nthread];
}

// Free the global interleave_info array and reset the pointer. Safe to call repeatedly.
void destroy_interleave_info() {
    // delete[] on a null pointer is a no-op, so no explicit check is required
    delete[] interleave_info;
    interleave_info = nullptr;
}

// More precise visualization of the warp quality.
// Can be called after admin2.
//
// Walks the cycles of warp `iwarp` and, for each core slot of each cycle, classifies it as
//   '.'  node whose parent directly follows the parent of the previous slot
//   'o'  node whose parent does not follow the previous one (counted as a cache access)
//   'r'  node sharing its parent with an earlier slot of the same cycle (child race)
//   'R'  both 'r' and 'o'
//   'X'  idle slot
// The totals are stored in ii.nnode, ii.ncycle, ii.idle, ii.cache_access and
// ii.child_race at index iwarp. `p` is the parent index array in permuted node order.
static void print_quality2(int iwarp, InterleaveInfo& ii, int* p) {
    // Printing is turned off; change to (iwarp == 0) to print the map of warp 0.
    const bool pc = false;

    int* stride = ii.stride + ii.stridedispl[iwarp];
    const int ncycle = ii.cellsize[iwarp];
    int inode = ii.lastnode[iwarp];  // first node handled by this warp

    size_t nn = 0;          // number of nodes in warp. '.'
    size_t nx = 0;          // number of idle cores on all cycles. 'X'
    size_t ncacheline = 0;  // number of parent memory cacheline accesses.
                            //   assume warpsize is max number in a cacheline
    size_t ncr = 0;         // number of child race. nchild-1 of same parent in same cycle

    for (int icycle = 0; icycle < ncycle; ++icycle) {
        const int s = stride[icycle];
        int lastp = -2;
        if (pc) {
            printf("  ");
        }
        std::set<int> crace;  // parents already seen in this cycle
        for (int icore = 0; icore < warpsize; ++icore) {
            char ch = '.';
            if (icore < s) {
                const int par = p[inode++];
                // insert() reports whether the parent was already present,
                // which saves the separate lookup
                if (!crace.insert(par).second) {
                    ch = 'r';
                    ++ncr;
                }
                if (par != lastp + 1) {
                    ch = (ch == 'r') ? 'R' : 'o';
                    ++ncacheline;
                }
                lastp = par;
                ++nn;
            } else {
                ch = 'X';
                ++nx;
            }
            if (pc) {
                printf("%c", ch);
            }
        }
        if (pc) {
            printf("\n");
        }
    }

    ii.nnode[iwarp] = nn;
    ii.ncycle[iwarp] = size_t(ncycle);
    ii.idle[iwarp] = nx;
    ii.cache_access[iwarp] = ncacheline;
    ii.child_race[iwarp] = ncr;
    if (pc) {
        // size_t values are printed with %zu
        printf("warp %d:  %zu nodes, %d cycles, %zu idle, %zu cache access, %zu child races\n",
               iwarp,
               nn,
               ncycle,
               nx,
               ncacheline,
               ncr);
    }
}

// Warp quality statistics for the layout used with interleave_permute_type == 1.
//
// Warp `iwarp` covers the cells [iwarp * warpsize, iwarp * warpsize + warpsize), clipped to
// stride[0]. For every cycle each core slot is classified as
//   '.'  node whose parent directly follows the parent of the previous slot
//   'o'  node whose parent does not follow the previous one (counted as a cache access)
//   'X'  idle slot
// The totals are stored in ii.nnode, ii.ncycle, ii.idle and ii.cache_access at index iwarp;
// ii.child_race[iwarp] is set to 0. `p` is the parent index array in permuted node order.
static void print_quality1(int iwarp, InterleaveInfo& ii, int ncell, int* p) {
    // Printing is turned off; change to
    // ((iwarp == 0) || iwarp == (ii.nwarp - 1)) to print the first and last warp.
    const bool pc = false;

    int* stride = ii.stride;
    int cellbegin = iwarp * warpsize;
    int cellend = cellbegin + warpsize;
    cellend = (cellend < stride[0]) ? cellend : stride[0];

    // the warp needs as many cycles as its largest cell has nodes
    int ncycle = 0;
    for (int i = cellbegin; i < cellend; ++i) {
        if (ncycle < ii.cellsize[i]) {
            ncycle = ii.cellsize[i];
        }
    }
    // the last cell of the warp is expected to be the largest one
    nrn_assert(ncycle == ii.cellsize[cellend - 1]);
    nrn_assert(ncycle <= ii.nstride);

    int ncell_in_warp = cellend - cellbegin;

    size_t n = 0;           // number of nodes in warp (not including roots)
    size_t nx = 0;          // number of idle cores on all cycles. X
    size_t ncacheline = 0;  // number of parent memory cacheline accesses.
                            // assume warpsize is max number in a cacheline so
                            // first core has all o

    int inode = ii.firstnode[cellbegin];
    for (int icycle = 0; icycle < ncycle; ++icycle) {
        // slots below sbegin belong to cells that have no node in this cycle
        int sbegin = ncell - stride[icycle] - cellbegin;
        int lastp = -2;
        if (pc) {
            printf("  ");
        }
        for (int icore = 0; icore < warpsize; ++icore) {
            char ch = '.';
            if (icore < ncell_in_warp && icore >= sbegin) {
                int par = p[inode + icore];
                if (par != lastp + 1) {
                    ch = 'o';
                    ++ncacheline;
                }
                lastp = par;
                ++n;
            } else {
                ch = 'X';
                ++nx;
            }
            if (pc) {
                printf("%c", ch);
            }
        }
        if (pc) {
            printf("\n");
        }
        inode += ii.stride[icycle + 1];
    }

    ii.nnode[iwarp] = n;
    ii.ncycle[iwarp] = (size_t) ncycle;
    ii.idle[iwarp] = nx;
    ii.cache_access[iwarp] = ncacheline;
    ii.child_race[iwarp] = 0;
    if (pc) {
        // size_t values are printed with %zu
        printf("warp %d:  %zu nodes, %d cycles, %zu idle, %zu cache access\n",
               iwarp,
               n,
               ncycle,
               nx,
               ncacheline);
    }
}

// Summarize the per-warp statistics stored in `ii` for thread `ith`.
//
// For the four statistics nodes / idle / cache access / child race the sum, minimum and maximum
// over all warps are gathered, together with the range of the warp efficiency
// nnode / (nnode + idle) and the load balance of the cycle counts as computed by load_balance().
// The summary is only printed when DEBUG is defined; otherwise the function has no visible
// effect.
static void warp_balance(int ith, InterleaveInfo& ii) {
    size_t nwarp = size_t(ii.nwarp);
    size_t smm[4][3];  // sum_min_max see cp below
    for (size_t j = 0; j < 4; ++j) {
        smm[j][0] = 0;
        smm[j][1] = 1000000000;  // start value for the minimum search
        smm[j][2] = 0;
    }
    double emax = 0.0, emin = 1.0;
    for (size_t i = 0; i < nwarp; ++i) {
        size_t n = ii.nnode[i];
        // fraction of core slots doing useful work in this warp
        double e = double(n) / (n + ii.idle[i]);
        if (emax < e) {
            emax = e;
        }
        if (emin > e) {
            emin = e;
        }
        size_t s[4] = {n, ii.idle[i], ii.cache_access[i], ii.child_race[i]};
        for (size_t j = 0; j < 4; ++j) {
            smm[j][0] += s[j];
            if (smm[j][1] > s[j]) {
                smm[j][1] = s[j];
            }
            if (smm[j][2] < s[j]) {
                smm[j][2] = s[j];
            }
        }
    }
    std::vector<size_t> v(nwarp);
    for (size_t i = 0; i < nwarp; ++i) {
        v[i] = ii.ncycle[i];
    }
    double bal = load_balance(v);
#ifdef DEBUG
    // size_t values are printed with %zu
    printf(
        "thread %d nwarp=%zu  balance=%g  warp_efficiency %g to %g\n", ith, nwarp, bal, emin, emax);
    const char* cp[4] = {"nodes", "idle", "ca", "cr"};
    for (size_t i = 0; i < 4; ++i) {
        printf("  %s=%zu (%zu:%zu)", cp[i], smm[i][0], smm[i][1], smm[i][2]);
    }
    printf("\n");
#else
    (void) bal;  // Remove warning about unused
#endif
}

int* interleave_order(int ith, int ncell, int nnode, int* parent) {
    // return if there are no nodes to permute
    if (nnode <= 0)
        return nullptr;

    // ensure parent of root = -1
    for (int i = 0; i < ncell; ++i) {
        if (parent[i] == 0) {
            parent[i] = -1;
        }
    }

    int nwarp = 0, nstride = 0, *stride = nullptr, *firstnode = nullptr;
    int *lastnode = nullptr, *cellsize = nullptr, *stridedispl = nullptr;

    int* order = node_order(
        ncell, nnode, parent, nwarp, nstride, stride, firstnode, lastnode, cellsize, stridedispl);

    if (interleave_info) {
        InterleaveInfo& ii = interleave_info[ith];
        ii.nwarp = nwarp;
        ii.nstride = nstride;
        ii.stridedispl = stridedispl;
        ii.stride = stride;
        ii.firstnode = firstnode;
        ii.lastnode = lastnode;
        ii.cellsize = cellsize;
        if (0 && ith == 0 && interleave_permute_type == 1) {
            printf("ith=%d nstride=%d ncell=%d nnode=%d\n", ith, nstride, ncell, nnode);
            for (int i = 0; i < ncell; ++i) {
                printf("icell=%d cellsize=%d first=%d last=%d\n",
                       i,
                       cellsize[i],
                       firstnode[i],
                       lastnode[i]);
            }
            for (int i = 0; i < nstride; ++i) {
                printf("istride=%d stride=%d\n", i, stride[i]);
            }
        }
        if (ith == 0) {
            // needed for print_quality[12] and done once here to save time
            int* p = new int[nnode];
            for (int i = 0; i < nnode; ++i) {
                p[i] = parent[i];
            }
            permute_ptr(p, nnode, order);
            node_permute(p, nnode, order);

            ii.nnode = new size_t[nwarp];
            ii.ncycle = new size_t[nwarp];
            ii.idle = new size_t[nwarp];
            ii.cache_access = new size_t[nwarp];
            ii.child_race = new size_t[nwarp];
            for (int i = 0; i < nwarp; ++i) {
                if (interleave_permute_type == 1) {
                    print_quality1(i, interleave_info[ith], ncell, p);
                }
                if (interleave_permute_type == 2) {
                    print_quality2(i, interleave_info[ith], p);
                }
            }
            delete[] p;
            warp_balance(ith, interleave_info[ith]);
        }
    }

    return order;
}

#if INTERLEAVE_DEBUG  // only the cell per core style
// Debug helper (cell per core style): rebuild, from the parent indices of `nt`, the list of
// non-root node indices of every cell and check it against the bookkeeping in `ii`.
// Returns a newly allocated array of ncell newly allocated index arrays; the caller owns them.
static int** cell_indices_debug(NrnThread& nt, InterleaveInfo& ii) {
    const int ncell = nt.ncell;
    const int nnode = nt.end;
    int* parents = nt._v_parent_index;

    // we expect the nodes to be interleave ordered with smallest cell first
    // establish consistency with ii.
    // first ncell parents are -1
    for (int i = 0; i < ncell; ++i) {
        nrn_assert(parents[i] == -1);
    }

    // sz[c]: number of non-root nodes of cell c; cell[n]: cell that node n belongs to
    std::vector<int> sz(ncell, 0);
    std::vector<int> cell(nnode);
    for (int i = 0; i < ncell; ++i) {
        cell[i] = i;  // a root belongs to its own cell
    }
    for (int i = ncell; i < nnode; ++i) {
        const int owner = cell[parents[i]];  // a node inherits the cell of its parent
        cell[i] = owner;
        sz[owner] += 1;
    }

    // cells are in increasing sz order;
    for (int i = 1; i < ncell; ++i) {
        nrn_assert(sz[i - 1] <= sz[i]);
    }
    // same as ii.cellsize
    for (int i = 0; i < ncell; ++i) {
        nrn_assert(sz[i] == ii.cellsize[i]);
    }

    int** cellindices = new int*[ncell];
    for (int i = 0; i < ncell; ++i) {
        cellindices[i] = new int[sz[i]];
        sz[i] = 0;  // restart sz counts, they serve as fill positions below
    }
    for (int i = ncell; i < nnode; ++i) {
        const int owner = cell[i];
        cellindices[owner][sz[owner]] = i;
        sz[owner] += 1;
    }
    // cellindices first and last same as ii first and last
    for (int i = 0; i < ncell; ++i) {
        nrn_assert(cellindices[i][0] == ii.firstnode[i]);
        nrn_assert(cellindices[i][sz[i] - 1] == ii.lastnode[i]);
    }

    return cellindices;
}

// Per-thread result of cell_indices_debug (nullptr for threads without cells).
static int*** cell_indices_threads;

// Build the debug cell index tables for all threads.
void mk_cell_indices() {
    cell_indices_threads = new int**[nrn_nthread];
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        NrnThread& nt = nrn_threads[tid];
        cell_indices_threads[tid] = nt.ncell ? cell_indices_debug(nt, interleave_info[tid])
                                             : nullptr;
    }
}
#endif  // INTERLEAVE_DEBUG

// Shorthand accessors for element i of the node arrays of a thread. They expand to members of
// a pointer named `nt`, so a variable of that name has to be in scope at the point of use.
#define GPU_V(i)      nt->_actual_v[i]
#define GPU_A(i)      nt->_actual_a[i]
#define GPU_B(i)      nt->_actual_b[i]
#define GPU_D(i)      nt->_actual_d[i]
#define GPU_RHS(i)    nt->_actual_rhs[i]
#define GPU_PARENT(i) nt->_v_parent_index[i]

// How does the interleaved permutation with stride get used in
// triangularization?

// Triangularization of one cell; cells can be processed in parallel regardless of
// inhomogeneous topology. Starting at the last node of cell `icell`, each visited node
// eliminates its contribution from the D and RHS entries of its parent, then the walk steps
// back by the stride of the current level.
static void triang_interleaved(NrnThread* nt,
                               int icell,
                               int icellsize,
                               int nstride,
                               int* stride,
                               int* lastnode) {
    // only first icellsize strides matter, so begin at the last relevant one
    const int nlevel = (icellsize < nstride) ? icellsize : nstride;
    int node = lastnode[icell];
    for (int istride = nlevel - 1; istride >= 0; --istride) {
        // what is the index
        int ip = GPU_PARENT(node);
#ifndef CORENEURON_ENABLE_GPU
        nrn_assert(ip >= 0);  // if (ip < 0) return;
#endif
        const double factor = GPU_A(node) / GPU_D(node);
        GPU_D(ip) -= factor * GPU_B(node);
        GPU_RHS(ip) -= factor * GPU_RHS(node);
        node -= stride[istride];
    }
}

// Back substitution for one cell: solve the root first, then walk from the first node of cell
// `icell` forward, resolving each node from the already solved value of its parent.
static void bksub_interleaved(NrnThread* nt,
                              int icell,
                              int icellsize,
                              int /* nstride */,
                              int* stride,
                              int* firstnode) {
    GPU_RHS(icell) /= GPU_D(icell);  // the root

    int node = firstnode[icell];
    // `next` is the index of the stride leading to the following node
    for (int next = 1; next <= icellsize; ++next) {
        int ip = GPU_PARENT(node);
#ifndef CORENEURON_ENABLE_GPU
        nrn_assert(ip >= 0);
#endif
        GPU_RHS(node) -= GPU_B(node) * GPU_RHS(ip);
        GPU_RHS(node) /= GPU_D(node);
        node += stride[next];
    }
}

// Triangularization for the group of cells handled by one warp.
// Works through the ncycle cycles of the warp from the last one to the first; in every cycle up
// to warpsize nodes eliminate their contribution from the D and RHS entries of their parents.
//
// icore ranges [0:warpsize) ; stride[ncycle]
// lastnode is the index one past the last node of the warp: the first cycle processed starts at
// lastnode - stride[ncycle - 1] + icore.
// NOTE(review): nrn_pragma_acc / nrn_pragma_omp presumably expand to OpenACC / OpenMP pragmas
// when offloading is enabled - see their definition.
nrn_pragma_acc(routine vector)
static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) {
    int icycle = ncycle - 1;
    int istride = stride[icycle];
    // The inner loop below declares its own `i` and `icore`; this outer `i` is kept in step
    // with `ii` but is not used for indexing.
    int i = lastnode - istride + icore;
    int ii = i;  // index of the first node of the current cycle

    // execute until all tree depths are executed
    bool has_subtrees_to_compute = true;

    // clang-format off
    nrn_pragma_acc(loop seq)
    for (; has_subtrees_to_compute; ) {  // ncycle loop
        // serial test, gpu does this in parallel
        nrn_pragma_acc(loop vector)
        nrn_pragma_omp(loop bind(parallel))
        for (int icore = 0; icore < warpsize; ++icore) {
            int i = ii + icore;
            if (icore < istride) {  // most efficient if istride equal  warpsize
                // what is the index
                int ip = GPU_PARENT(i);
                double p = GPU_A(i) / GPU_D(i);
                // several nodes of one cycle may share a parent, hence the atomic updates
                nrn_pragma_acc(atomic update)
                nrn_pragma_omp(atomic update)
                GPU_D(ip) -= p * GPU_B(i);
                nrn_pragma_acc(atomic update)
                nrn_pragma_omp(atomic update)
                GPU_RHS(ip) -= p * GPU_RHS(i);
            }
        }
        // if finished with all tree depths then ready to break
        // (note that break is not allowed in OpenACC)
        if (icycle == 0) {
            has_subtrees_to_compute = false;
            continue;
        }
        // step back to the previous cycle
        --icycle;
        istride = stride[icycle];
        i -= istride;
        ii -= istride;
    }
}

// Back substitution for the group of cells handled by one warp.
// First the roots in [root, lastroot) are solved, then the ncycle cycles are processed from the
// first to the last; in every cycle up to warpsize nodes are resolved from the values of their
// parents.
//
// icore ranges [0:warpsize) ; stride[ncycle]
// firstnode is the index of the first non-root node of the warp.
// NOTE(review): nrn_pragma_acc / nrn_pragma_omp presumably expand to OpenACC / OpenMP pragmas
// when offloading is enabled - see their definition.
nrn_pragma_acc(routine vector)
static void bksub_interleaved2(NrnThread* nt,
                               int root,
                               int lastroot,
                               int icore,
                               int ncycle,
                               int* stride,
                               int firstnode) {
    nrn_pragma_acc(loop seq)
    for (int i = root; i < lastroot; i += 1) {
        GPU_RHS(i) /= GPU_D(i);  // the root
    }

    // The inner loop below declares its own `i` and `icore`; this outer `i` only serves to
    // initialize `ii`, the index of the first node of the current cycle.
    int i = firstnode + icore;
    int ii = i;
    nrn_pragma_acc(loop seq)
    for (int icycle = 0; icycle < ncycle; ++icycle) {
        int istride = stride[icycle];
        // serial test, gpu does this in parallel
        nrn_pragma_acc(loop vector)
        nrn_pragma_omp(loop bind(parallel))
        for (int icore = 0; icore < warpsize; ++icore) {
            int i = ii + icore;
            if (icore < istride) {
                int ip = GPU_PARENT(i);
                GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
                GPU_RHS(i) /= GPU_D(i);
            }
            // modifies the loop-local `i` only, which is re-declared on the next iteration
            i += istride;
        }
        // advance to the first node of the next cycle
        ii += istride;
    }
}

/**
 * \brief Solve Hines matrices/cells with compartment-based granularity.
 *
 * The node ordering/permuation guarantees cell interleaving (as much coalesced memory access as
 * possible) and balanced warps (through the use of lpt algorithm to define the groups/warps). Every
 * warp deals with a group of cells, therefore multiple compartments (finer level of parallelism).
 */
void solve_interleaved2(int ith) {
    // \param ith index into nrn_threads / interleave_info of the thread to solve
    NrnThread* nt = nrn_threads + ith;
    InterleaveInfo& ii = interleave_info[ith];
    int nwarp = ii.nwarp;
    // nothing to solve when no warps were set up for this thread
    if (nwarp == 0)
        return;

    // upper bound of the icore index space: warpsize entries per warp
    int ncore = nwarp * warpsize;

#ifdef _OPENACC
    // With both the gpu and cuda_interface parameters set, delegate to the CUDA launcher,
    // passing the device addresses of nt and of this thread's InterleaveInfo together with
    // the CUDA stream that OpenACC associates with nt->stream_id.
    if (corenrn_param.gpu && corenrn_param.cuda_interface) {
        auto* d_nt = static_cast<NrnThread*>(acc_deviceptr(nt));
        auto* d_info = static_cast<InterleaveInfo*>(acc_deviceptr(interleave_info + ith));
        solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
    } else {
#endif
        // In interleave2 mode the InterleaveInfo arrays are reused under different names.
        int* ncycles = ii.cellsize;         // nwarp of these
        int* stridedispl = ii.stridedispl;  // nwarp+1 of these
        int* strides = ii.stride;           // sum ncycles of these (bad since ncompart/warpsize)
        int* rootbegin = ii.firstnode;      // nwarp+1 of these
        int* nodebegin = ii.lastnode;       // nwarp+1 of these
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
        // total length of strides; only referenced by the OpenACC present() clause below
        int nstride = stridedispl[nwarp];
#endif
        /* If we compare this loop with the one from cellorder.cu (CUDA version), we see that the
         * parallelism here is exposed in steps, while in the CUDA version all the parallelism is
         * exposed from the very beginning of the loop. In more detail: here we initially
         * distribute the outermost loop (e.g. over the CUDA blocks), and for the innermost loops
         * we explicitly use multiple threads for the parallelization (see for example the loop
         * directives in triang/bksub_interleaved2). In the CUDA version, on the other hand, the
         * outermost loop is distributed to all the available threads, and therefore there is no
         * need for the innermost loops. Here the loop index icore jumps by warpsize, while in the
         * CUDA version icore increases by one. Other than this, the two loop versions are
         * equivalent (same results).
         */
        nrn_pragma_acc(parallel loop gang present(nt [0:1],
                              strides [0:nstride],
                              ncycles [0:nwarp],
                              stridedispl [0:nwarp + 1],
                              rootbegin [0:nwarp + 1],
                              nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
        nrn_pragma_omp(target teams loop if(nt->compute_gpu))
        for (int icore = 0; icore < ncore; icore += warpsize) {
            int iwarp = icore / warpsize;     // figure out the >> value
            // NOTE(review): icore is always a multiple of warpsize in this loop, so ic is 0
            // provided warpsize is a power of two - confirm against the warpsize definition.
            int ic = icore & (warpsize - 1);  // figure out the & mask
            int ncycle = ncycles[iwarp];
            // first stride entry belonging to this warp
            int* stride = strides + stridedispl[iwarp];
            int root = rootbegin[iwarp];  // cell ID -> [0, ncell)
            int lastroot = rootbegin[iwarp + 1];
            int firstnode = nodebegin[iwarp];
            int lastnode = nodebegin[iwarp + 1];

            // triangularization followed by back substitution for this warp
            triang_interleaved2(nt, ic, ncycle, stride, lastnode);
            bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
        }
        // block until the asynchronous region queued on nt->stream_id has completed
        nrn_pragma_acc(wait(nt->stream_id))
#ifdef _OPENACC
    }
#endif
}

/**
 * \brief Solve Hines matrices/cells with cell-based granularity.
 *
 * The node ordering guarantees cell interleaving (as much coalesced memory access as possible),
 * but parallelism granularity is limited to a per cell basis. Therefore every execution stream
 * is mapped to a cell/tree.
 */
void solve_interleaved1(int ith) {
    // \param ith index into nrn_threads / interleave_info of the thread to solve
    NrnThread* nt = nrn_threads + ith;
    int ncell = nt->ncell;
    // nothing to solve for a thread without cells
    if (ncell == 0) {
        return;
    }
    InterleaveInfo& ii = interleave_info[ith];
    int nstride = ii.nstride;
    int* stride = ii.stride;
    int* firstnode = ii.firstnode;
    int* lastnode = ii.lastnode;
    int* cellsize = ii.cellsize;

    // OL211123: can we preserve the error checking behaviour of OpenACC's
    // present clause with OpenMP? It is a bug if these data are not present,
    // so diagnostics are helpful...
    nrn_pragma_acc(parallel loop present(nt [0:1],
                                         stride [0:nstride],
                                         firstnode [0:ncell],
                                         lastnode [0:ncell],
                                         cellsize [0:ncell]) if (nt->compute_gpu)
                       async(nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    // one loop iteration per cell: triangularization, then back substitution
    for (int icell = 0; icell < ncell; ++icell) {
        int icellsize = cellsize[icell];
        triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
        bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
    }
    // block until the asynchronous region queued on nt->stream_id has completed
    nrn_pragma_acc(wait(nt->stream_id))
}

/// Dispatch the Hines solve for thread \p ith according to interleave_permute_type:
/// the value 1 selects the cell-granularity solver, every other value the warp-based one.
void solve_interleaved(int ith) {
    if (interleave_permute_type == 1) {
        solve_interleaved1(ith);
        return;
    }
    solve_interleaved2(ith);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/cellorder.cu
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/utils/utils_cuda.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/network/tnode.hpp"
#include "coreneuron/sim/multicore.hpp"

namespace coreneuron {

/**
 * \brief Triangularization step of the interleave2 solver for one lane of a warp.
 *
 * Walks the cycles of the warp from the last one to the first. In every cycle the lane
 * handles (at most) one node and folds its contribution into the parent's diagonal and
 * right-hand side with atomic adds, because several lanes may share a parent.
 *
 * \param nt       thread data holding the matrix arrays and the parent index array
 * \param icore    lane index inside the warp
 * \param ncycle   number of cycles (entries of \p stride) of this warp
 * \param stride   stride[icycle] is the number of nodes processed in cycle icycle
 * \param lastnode index one past the last node of this warp
 */
__device__ void triang_interleaved2_device(NrnThread* nt,
                                           int icore,
                                           int ncycle,
                                           int* stride,
                                           int lastnode) {
    // Node handled by this lane; stepped back by the cycle's stride at the top of each
    // iteration, so the first processed node is lastnode - stride[ncycle - 1] + icore.
    int i = lastnode + icore;

    // The stride is read only for valid cycle indices: stride[-1] is never touched, neither
    // after the last iteration nor when ncycle == 0.
    for (int icycle = ncycle - 1; icycle >= 0; --icycle) {
        const int istride = stride[icycle];
        i -= istride;
        // most efficient if istride equal warpsize, else branch divergence!
        if (icore < istride) {
            const int ip = nt->_v_parent_index[i];
            const double p = nt->_actual_a[i] / nt->_actual_d[i];
            atomicAdd(&nt->_actual_d[ip], -p * nt->_actual_b[i]);
            atomicAdd(&nt->_actual_rhs[ip], -p * nt->_actual_rhs[i]);
        }
    }
}

/**
 * \brief Back-substitution step of the interleave2 solver for one lane of a warp.
 *
 * First divides the right-hand side of the roots assigned to this lane (root, root + warpsize,
 * ... below lastroot) by their diagonal, then walks the cycles from first to last and
 * updates the node handled by this lane in each cycle from its parent's solution.
 */
__device__ void bksub_interleaved2_device(NrnThread* nt,
                                          int root,
                                          int lastroot,
                                          int icore,
                                          int ncycle,
                                          int* stride,
                                          int firstnode) {
    // the roots
    int node = root;
    while (node < lastroot) {
        nt->_actual_rhs[node] /= nt->_actual_d[node];
        node += warpsize;
    }

    // the remaining nodes, one cycle at a time
    node = firstnode + icore;
    for (int cycle = 0; cycle < ncycle; ++cycle) {
        const int width = stride[cycle];
        // lanes beyond the width of this cycle have no node to process
        if (icore < width) {
            const int parent = nt->_v_parent_index[node];
            nt->_actual_rhs[node] -= nt->_actual_b[node] * nt->_actual_rhs[parent];
            nt->_actual_rhs[node] /= nt->_actual_d[node];
        }
        node += width;
    }
}

/**
 * \brief CUDA kernel running the interleave2 solve over the icore index space [0, ncore).
 *
 * Every thread starts at its global index and advances by the total number of launched
 * threads, so any grid size covers all ncore entries.
 */
__global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int ncore) {
    // interleave2 view of the InterleaveInfo arrays
    int* const cycles_per_warp = ii->cellsize;  // nwarp of these
    int* const stride_offset = ii->stridedispl;  // nwarp+1 of these
    int* const all_strides = ii->stride;  // sum ncycles of these (bad since ncompart/warpsize)
    int* const root_offset = ii->firstnode;  // nwarp+1 of these
    int* const node_offset = ii->lastnode;   // nwarp+1 of these

    for (int icore = blockDim.x * blockIdx.x + threadIdx.x; icore < ncore;
         icore += blockDim.x * gridDim.x) {
        const int warp = icore / warpsize;       // figure out the >> value
        const int lane = icore & (warpsize - 1);  // figure out the & mask
        int* const warp_strides = all_strides + stride_offset[warp];
        const int ncycle = cycles_per_warp[warp];

        triang_interleaved2_device(nt, lane, ncycle, warp_strides, node_offset[warp + 1]);
        bksub_interleaved2_device(nt,
                                  root_offset[warp] + lane,
                                  root_offset[warp + 1],
                                  lane,
                                  ncycle,
                                  warp_strides,
                                  node_offset[warp]);
    }
}

/**
 * \brief Launch solve_interleaved2_kernel on the given CUDA stream and wait for it.
 *
 * \param nt     thread data pointer passed through to the kernel
 * \param info   interleave info pointer passed through to the kernel
 * \param ncore  size of the icore index space (number of warps times warpsize)
 * \param stream CUDA stream handle, passed as void* and cast back to cudaStream_t
 */
void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
    /// The block size was selected after running the channel-benchmark for typical production
    /// runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. In the
    /// OpenACC/OpenMP implementations threadsPerBlock is set to 32. Profiling the
    /// channel-benchmark circuits mentioned above showed that the best performance was
    /// achieved with this configuration.
    int block_size = warpsize;

    /// Max number of blocksPerGrid for NVIDIA GPUs is 65535, so the grid size the kernel is
    /// launched with is capped at this number.
    const auto grid_limit = 65535;
    int wanted_blocks = (ncore + block_size - 1) / block_size;
    int grid_size = wanted_blocks;
    if (wanted_blocks > grid_limit) {
        grid_size = grid_limit;
    }

    auto cuda_stream = static_cast<cudaStream_t>(stream);
    solve_interleaved2_kernel<<<grid_size, block_size, 0, cuda_stream>>>(nt, info, ncore);

    // wait for the kernel before checking for errors
    cudaStreamSynchronize(cuda_stream);

    CHECKLAST("solve_interleaved2_launcher");
}

}  // namespace coreneuron


================================================
FILE: coreneuron/permute/cellorder.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/utils/memory.h"
#include <algorithm>
namespace coreneuron {

/**
 * \brief Function that performs the permutation of the cells such that the
 *        execution threads access coalesced memory.
 *
 * \param ith NrnThread to access
 * \param ncell number of cells in NrnThread
 * \param nnode number of compartments in the ncells
 * \param parent parent indices of cells
 *
 * \return int* order, interleaved order of the cells
 */
int* interleave_order(int ith, int ncell, int nnode, int* parent);

void create_interleave_info();
void destroy_interleave_info();

/**
 *
 * \brief Solve the Hines matrices based on the interleave_permute_type (1 or 2).
 *
 * For interleave_permute_type == 1 : Naive interleaving -> Each execution thread deals with one
 * Hines matrix (cell).
 * For interleave_permute_type == 2 : Advanced interleaving -> Each Hines matrix is solved by
 * multiple execution threads (with coalesced memory access as well).
 */
extern void solve_interleaved(int ith);

class InterleaveInfo;  // forward declaration
/**
 *
 * \brief CUDA branch of the solve_interleaved with interleave_permute_type == 2.
 *
 * This branch is activated in runtime with the --cuda-interface CLI flag
 */
void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream);

/**
 * \brief Per-thread bookkeeping consumed by the interleaved Hines solvers.
 *
 * The int arrays have a different meaning (and length) depending on the permutation type;
 * the trailing comments give the interleave2 meaning of each member.
 */
class InterleaveInfo: public MemoryManaged {
  public:
    InterleaveInfo() = default;
    // Copy operations and destructor are user-declared; their definitions are not part of
    // this header. NOTE(review): presumably they deep-copy / release the arrays below - confirm.
    InterleaveInfo(const InterleaveInfo&);
    InterleaveInfo& operator=(const InterleaveInfo&);
    ~InterleaveInfo();
    int nwarp = 0;  // used only by interleave2
    int nstride = 0;
    int* stridedispl = nullptr;  // interleave2: nwarp+1
    int* stride = nullptr;       // interleave2: stride  length is stridedispl[nwarp]
    int* firstnode = nullptr;    // interleave2: rootbegin nwarp+1 displacements
    int* lastnode = nullptr;     // interleave2: nodebegin nwarp+1 displacements
    int* cellsize = nullptr;     // interleave2: ncycles nwarp

    // statistics (nwarp of each)
    size_t* nnode = nullptr;
    size_t* ncycle = nullptr;
    size_t* idle = nullptr;
    size_t* cache_access = nullptr;
    size_t* child_race = nullptr;

  private:
    // NOTE(review): presumably exchanges all members with info (helper for copy assignment) -
    // definition not visible here, confirm.
    void swap(InterleaveInfo& info);
};

/**
 * \brief Function that returns a permutation of length nnode.
 *
 * There are two permutation strategies:
 * For interleave_permute_type == 1 : Naive interleaving -> Each execution thread deals with one
 * Hines matrix (cell).
 * For interleave_permute_type == 2 : Advanced interleaving -> Each Hines matrix is solved by
 * multiple execution threads (with coalesced memory access as well).
 *
 * \param ncell number of cells
 * \param nnode number of compartments in the ncells
 * \param parents parent indices of the cells
 * \param nwarp number of warps
 * \param nstride nstride is the maximum cell size (not counting root)
 * \param stride stride[i] is the number of cells with an ith node:
 *               using stride[i] we know how many positions to move in order to
 *               access the next element of the same cell (given that the cells are
 *               ordered with the treenode_order).
 * \param firstnode firstnode[i] is the index of the first nonroot node of the cell
 * \param lastnode lastnode[i] is the index of the last node of the cell
 * \param cellsize cellsize is the number of nodes in the cell not counting root.
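 * \param stridedispl (interleave_permute_type == 2 only) running sum of the per-warp cycle
 *               counts, nwarp+1 entries: the strides of warp i start at stride[stridedispl[i]]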
 * \return int* : a permutation of length nnode
 */
int* node_order(int ncell,
                int nnode,
                int* parents,
                int& nwarp,
                int& nstride,
                int*& stride,
                int*& firstnode,
                int*& lastnode,
                int*& cellsize,
                int*& stridedispl);

// Allocate a fresh array of n elements with new[] and fill it element by element
// from src[0:n]; dest is overwritten with the address of the new array.
template <typename T>
void copy_array(T*& dest, T* src, size_t n) {
    dest = new T[n];
    for (size_t k = 0; k < n; ++k) {
        dest[k] = src[k];
    }
}

// Same as copy_array, but the destination storage for the n elements comes from
// ecalloc_align (NRN_SOA_BYTE_ALIGN allocation) instead of new[].
template <typename T>
void copy_align_array(T*& dest, T* src, size_t n) {
    void* const storage = ecalloc_align(n, sizeof(T));
    dest = static_cast<T*>(storage);
    std::copy_n(src, n, dest);
}

#ifndef INTERLEAVE_DEBUG
#define INTERLEAVE_DEBUG 0
#endif

#if INTERLEAVE_DEBUG
void mk_cell_indices();
#endif
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/cellorder1.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstdio>
#include <map>
#include <set>
#include <algorithm>
#include <cstring>

#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/network/tnode.hpp"

// just for interleave_permute_type
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/utils/memory.h"


namespace coreneuron {
static size_t groupsize = 32;

/**
 * \brief Ordering predicate for trees: by treesize, then hash, then nodeindex.
 *
 * Returns true when \p a sorts strictly before \p b.
 */
static bool tnode_earlier(TNode* a, TNode* b) {
    // treesize dominates
    if (a->treesize != b->treesize) {
        return a->treesize < b->treesize;
    }
    // same treesize: keep identical trees (same hash) together
    if (a->hash != b->hash) {
        return a->hash < b->hash;
    }
    // identical trees are ordered by nodeindex
    return a->nodeindex < b->nodeindex;
}

// Thin forwarder to tnode_earlier; used as the comparator when TNode::mkhash sorts children.
static bool ptr_tnode_earlier(TNode* a, TNode* b) {
    return tnode_earlier(a, b);
}

/// Construct a parentless node with original index \p ix; all other fields start zeroed,
/// except treesize which counts the node itself.
TNode::TNode(int ix) {
    // identity and tree links
    nodeindex = ix;
    parent = nullptr;
    children.reserve(2);

    // a lone node is a tree of size one; mkhash() recomputes both values later
    treesize = 1;
    hash = 0;

    // bookkeeping assigned by the ordering passes
    level = 0;
    cellindex = 0;
    groupindex = 0;
    nodevec_index = 0;
    treenode_order = 0;
}

// Nothing to release explicitly in the destructor body.
TNode::~TNode() = default;

/// Recompute hash and treesize of this node from its children and return the hash.
/// Must be called on all nodes in leaf to root order, so that the children's values are final.
/// Hash-combine concept from
/// http://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector
size_t TNode::mkhash() {
    // the combined hash depends on child order, so put the children in canonical order first
    std::sort(children.begin(), children.end(), ptr_tnode_earlier);

    treesize = 1;
    hash = children.size();
    for (const auto& child: children) {
        hash ^= child->hash + 0x9e3779b9 + (hash << 6) + (hash >> 2);
        treesize += child->treesize;
    }
    // a leaf has no children, hence hash 0
    return hash;
}

static void tree_analysis(int* parent, int nnode, int ncell, VecTNode&);
static void node_interleave_order(int ncell, VecTNode&);
static void admin1(int ncell,
                   VecTNode& nodevec,
                   int& nwarp,
                   int& nstride,
                   int*& stride,
                   int*& firstnode,
                   int*& lastnode,
                   int*& cellsize);
static void admin2(int ncell,
                   VecTNode& nodevec,
                   int& nwarp,
                   int& nstride,
                   int*& stridedispl,
                   int*& strides,
                   int*& rootbegin,
                   int*& nodebegin,
                   int*& ncycles);
static void check(VecTNode&);
#if CORENRN_DEBUG
static void prtree(VecTNode&);
#endif

using TNI = std::pair<TNode*, int>;
using HashCnt = std::map<size_t, std::pair<TNode*, int>>;
using TNIVec = std::vector<TNI>;

/*
assess the quality of the ordering. The measure is the size of a contiguous
list of nodes whose parents have the same order. How many contiguous lists
have that same size. How many nodes participate in that size list.
Modify the quality measure from experience with performance. Start with
list of (nnode, size_participation)
*/
// Gather ordering-quality statistics for nodevec; max is the group (warp) width used when
// splitting runs. The results are only printed when CORENRN_DEBUG is enabled; nothing is
// returned and nodevec is not modified.
static void quality(VecTNode& nodevec, size_t max = 32) {
    size_t qcnt = 0;  // how many contiguous nodes have contiguous parents

    // first ncell nodes are by definition in contiguous order
    // (counts the leading nodes without a parent, i.e. the roots)
    for (const auto& n: nodevec) {
        if (n->parent != nullptr) {
            break;
        }
        qcnt += 1;
    }
    size_t ncell = qcnt;

    // key is how many parents in contiguous order
    // value is number of nodes that participate in that
    std::map<size_t, size_t> qual;
    // sentinel start value, so that the first non-root node is not treated as contiguous
    size_t ip_last = 10000000000;
    for (size_t i = ncell; i < nodevec.size(); ++i) {
        size_t ip = nodevec[i]->parent->nodevec_index;
        // i%max == 0 means that if we start a warp with 8 and then have 32
        // the 32 is broken into 24 and 8. (modify if the arrangement during
        // gaussian elimination becomes more sophisticated.)
        if (ip == ip_last + 1 && i % max != 0) {  // contiguous
            qcnt += 1;
        } else {
            if (qcnt == 1) {
                // printf("unique %ld p=%ld ix=%d\n", i, ip, nodevec[i]->nodeindex);
            }
            // the run ended: full groups of max are accounted under key max,
            // the remainder under its own length
            qual[max] += (qcnt / max) * max;
            size_t x = qcnt % max;
            if (x) {
                qual[x] += x;
            }
            qcnt = 1;
        }
        ip_last = ip;
    }
    // account for the run that was still open when the loop finished
    qual[max] += (qcnt / max) * max;
    size_t x = qcnt % max;
    if (x) {
        qual[x] += x;
    }

    // print result
    qcnt = 0;
#if CORENRN_DEBUG
    for (const auto& q: qual) {
        qcnt += q.second;
        printf("%6ld %6ld\n", q.first, q.second);
    }
#endif
#if CORENRN_DEBUG
    printf("qual.size=%ld  qual total nodes=%ld  nodevec.size=%ld\n",
           qual.size(),
           qcnt,
           nodevec.size());
#endif

    // how many race conditions. ie refer to same parent on different core
    // of warp (max cores) or parent in same group of max.
    size_t maxip = ncell;
    size_t nrace1 = 0;
    size_t nrace2 = 0;
    std::set<size_t> ipused;
    for (size_t i = ncell; i < nodevec.size(); ++i) {
        TNode* nd = nodevec[i];
        size_t ip = nd->parent->nodevec_index;
        // a new group of max nodes starts: reset the per-group bookkeeping
        if (i % max == 0) {
            maxip = i;
            ipused.clear();
        }
        // parent lies inside the current group
        if (ip >= maxip) {
            nrace1 += 1;
        } /*else*/
        {
            // parent already referenced by another node of the current group
            if (ipused.find(ip) != ipused.end()) {
                nrace2 += 1;
                if (ip >= maxip) {
                    // printf("race for parent %ld (parent in same group as multiple users))\n",
                    // ip);
                }
            } else {
                ipused.insert(ip);
            }
        }
    }
    // silence unused-variable warnings when the debug output is compiled out
    static_cast<void>(nrace1);
    static_cast<void>(nrace2);
#if CORENRN_DEBUG
    printf("nrace = %ld (parent in same group of %ld nodes)\n", nrace1, max);
    printf("nrace = %ld (parent used more than once by same group of %ld nodes)\n", nrace2, max);
#endif
}

/// Assign every node its depth below the root (roots get level 0, every other node its
/// parent's level plus one) and return the largest level assigned.
/// A node's parent must appear before the node in nodevec for the levels to be final.
size_t level_from_root(VecTNode& nodevec) {
    size_t deepest = 0;
    for (auto& node: nodevec) {
        if (!node->parent) {
            node->level = 0;
            continue;
        }
        node->level = node->parent->level + 1;
        if (deepest < node->level) {
            deepest = node->level;
        }
    }
    return deepest;
}

/**
 * \brief Assign levels counted from the leaves and return the largest one.
 *
 * Nodes without children get level 0; every other node gets one more than the highest level
 * among its children. nodevec is traversed from its last element to its first, so children
 * must come after their parent in nodevec for their levels to be set before they are read.
 *
 * \param nodevec nodes to label; an empty vector is accepted and yields 0
 * \return the maximum level assigned
 */
size_t level_from_leaf(VecTNode& nodevec) {
    size_t maxlevel = 0;
    // Count down with "i-- > 0" so that an empty vector performs no iteration
    // (size() - 1 would wrap around for an unsigned index).
    for (size_t i = nodevec.size(); i-- > 0;) {
        TNode* nd = nodevec[i];
        size_t lmax = 0;
        for (auto& child: nd->children) {
            if (lmax <= child->level) {
                lmax = child->level + 1;
            }
        }
        nd->level = lmax;
        if (maxlevel < lmax) {
            maxlevel = lmax;
        }
    }
    return maxlevel;
}

/**
 * \brief Set the cellindex to distinguish the different cells.
 */
static void set_cellindex(int ncell, VecTNode& nodevec) {
    for (int i = 0; i < ncell; ++i) {
        nodevec[i]->cellindex = i;
    }
    for (size_t i = 0; i < nodevec.size(); ++i) {
        TNode& nd = *nodevec[i];
        for (size_t j = 0; j < nd.children.size(); ++j) {
            TNode* cnode = nd.children[j];
            cnode->cellindex = nd.cellindex;
        }
    }
}

/**
 * \brief Initialization of the groupindex (groups)
 *
 * The cells are grouped at a later stage based on a load balancing algorithm.
 * This is just an initialization function: a parentless node at position i gets group
 * i / groupsize, every other node copies the group of its parent.
 */
static void set_groupindex(VecTNode& nodevec) {
    const size_t count = nodevec.size();
    for (size_t pos = 0; pos < count; ++pos) {
        TNode* node = nodevec[pos];
        if (!node->parent) {
            node->groupindex = pos / groupsize;
        } else {
            node->groupindex = node->parent->groupindex;
        }
    }
}

// Print, for every level (as assigned by level_from_root), how many nodes of each group
// live at that level. One line with the level number is followed by one line with the
// per-group counts.
//
// \param nodevec nodes to analyse; their level fields are (re)computed here
// \param ncell   number of cells; ncell / groupsize is used as the minimum number of
//                group columns per row
static void ident_statistic(VecTNode& nodevec, size_t ncell) {
    // (level_from_leaf(nodevec) would reverse the sense of levels, all leaves being level 0)
    size_t maxlevel = level_from_root(nodevec);

    // Size the per-level rows from the group indices actually present: ncell / groupsize
    // alone is one short whenever ncell is not a multiple of groupsize, which made the
    // increment below write past the end of the row.
    size_t ngroup = ncell / groupsize;
    for (const auto& n: nodevec) {
        ngroup = std::max(ngroup, static_cast<size_t>(n->groupindex) + 1);
    }

    // # in each level
    std::vector<std::vector<size_t>> n_in_level(maxlevel + 1, std::vector<size_t>(ngroup, 0));
    for (const auto& n: nodevec) {
        n_in_level[n->level][n->groupindex]++;
    }

    // size_t values are printed with %zu
    printf("n_in_level.size = %zu\n", n_in_level.size());
    for (size_t i = 0; i < n_in_level.size(); ++i) {
        printf("%5zu\n", i);
        for (const auto& n: n_in_level[i]) {
            printf(" %5zu", n);
        }
        printf("\n");
    }
}
#undef MSS

// Build the node permutation for the interleaved solvers together with the administrative
// arrays (written through the reference parameters by admin1 / admin2).
// The returned array is allocated with new int[nnode]; entry [original node index] holds the
// node's position in the new ordering.
int* node_order(int ncell,
                int nnode,
                int* parent,
                int& nwarp,
                int& nstride,
                int*& stride,
                int*& firstnode,
                int*& lastnode,
                int*& cellsize,
                int*& stridedispl) {
    VecTNode nodevec;

    // nodevec[0:ncell] in increasing size, with identical trees together,
    // and otherwise nodeindex order
    // nodevec.size = nnode
    tree_analysis(parent, nnode, ncell, nodevec);
    check(nodevec);

    // per-node bookkeeping needed by the ordering strategies below
    set_cellindex(ncell, nodevec);
    set_groupindex(nodevec);
    level_from_root(nodevec);

    // nodevec[ncell:nnode] cells are interleaved in nodevec[0:ncell] cell order
    if (interleave_permute_type == 1) {
        node_interleave_order(ncell, nodevec);
    } else {
        group_order2(nodevec, groupsize, ncell);
    }
    // the reordering must keep roots first and every parent before its children
    check(nodevec);

#if CORENRN_DEBUG
    for (int i = 0; i < ncell; ++i) {
        TNode& nd = *nodevec[i];
        printf("%d size=%ld hash=%ld ix=%d\n", i, nd.treesize, nd.hash, nd.nodeindex);
    }
#endif

    // ident_statistic is deliberately disabled; the call keeps the function referenced
    if (0)
        ident_statistic(nodevec, ncell);
    quality(nodevec);

    // the permutation
    int* nodeorder = new int[nnode];
    for (int i = 0; i < nnode; ++i) {
        TNode& nd = *nodevec[i];
        nodeorder[nd.nodeindex] = i;
    }

    // administrative statistics for gauss elimination
    if (interleave_permute_type == 1) {
        admin1(ncell, nodevec, nwarp, nstride, stride, firstnode, lastnode, cellsize);
    } else {
        //  admin2(ncell, nodevec, nwarp, nstride, stridedispl, stride, rootbegin, nodebegin,
        //  ncycles);
        // i.e. firstnode/lastnode/cellsize carry rootbegin/nodebegin/ncycles in this mode
        admin2(ncell, nodevec, nwarp, nstride, stridedispl, stride, firstnode, lastnode, cellsize);
    }

    // count how often the hash changes between neighbouring cells (debug statistic only)
    int ntopol = 1;
    for (int i = 1; i < ncell; ++i) {
        if (nodevec[i - 1]->hash != nodevec[i]->hash) {
            ntopol += 1;
        }
    }
    static_cast<void>(ntopol);
#ifdef DEBUG
    printf("%d distinct tree topologies\n", ntopol);
#endif

    // the TNodes were allocated in tree_analysis and are no longer needed
    for (size_t i = 0; i < nodevec.size(); ++i) {
        delete nodevec[i];
    }

    return nodeorder;
}

// Refresh nodevec_index of every node and assert the structural invariants of nodevec:
//  - the roots (nodes without parent) occupy the first positions of nodevec
//  - every other node is stored after its parent
// A violation of the second invariant is reported on stdout before the assertion fires.
void check(VecTNode& nodevec) {
    // printf("check\n");
    const size_t nnode = nodevec.size();
    size_t ncell = 0;
    for (size_t i = 0; i < nnode; ++i) {
        nodevec[i]->nodevec_index = i;
        if (nodevec[i]->parent == nullptr) {
            ncell++;
        }
    }
    ///  Check that the first compartments of nodevec are the root nodes (cells)
    for (size_t i = 0; i < ncell; ++i) {
        nrn_assert(nodevec[i]->parent == nullptr);
    }
    for (size_t i = ncell; i < nnode; ++i) {
        TNode& nd = *nodevec[i];
        if (nd.parent->nodevec_index >= nd.nodevec_index) {
            // %zu is the conversion for size_t; the casts keep the arguments in step with
            // the format independently of the declared member type
            printf("error i=%zu nodevec_index=%zu parent=%zu\n",
                   i,
                   static_cast<size_t>(nd.nodevec_index),
                   static_cast<size_t>(nd.parent->nodevec_index));
        }
        nrn_assert(nd.nodevec_index > nd.parent->nodevec_index);
    }
}

#if CORENRN_DEBUG
// Debug helper: refresh nodevec_index of every node, then print one line per node with its
// position, parent position (-1 for a root), cellindex, level, treenode_order, original
// node index and the parent's original node index (-1 for a root).
void prtree(VecTNode& nodevec) {
    size_t nnode = nodevec.size();
    // make sure the printed positions match the current order of nodevec
    for (size_t i = 0; i < nnode; ++i) {
        nodevec[i]->nodevec_index = i;
    }
    for (size_t i = 0; i < nnode; ++i) {
        TNode& nd = *nodevec[i];
        printf("%ld p=%d   c=%ld l=%ld o=%ld   ix=%d pix=%d\n",
               i,
               nd.parent ? int(nd.parent->nodevec_index) : -1,
               nd.cellindex,
               nd.level,
               nd.treenode_order,
               nd.nodeindex,
               nd.parent ? int(nd.parent->nodeindex) : -1);
    }
}
#endif

/**
 * \brief Perform tree preparation for interleaving strategies
 *
 * \param parent vector of parent indices
 * \param nnode number of compartments in the cells
 * \param ncell number of cells
 */
void tree_analysis(int* parent, int nnode, int ncell, VecTNode& nodevec) {
    // create empty TNodes (knowing only their index)
    nodevec.reserve(nnode);
    for (int i = 0; i < nnode; ++i) {
        nodevec.push_back(new TNode(i));
    }

    // determine the (sorted by hash) children of each node
    for (int i = nnode - 1; i >= ncell; --i) {
        nodevec[i]->parent = nodevec[parent[i]];
        nodevec[i]->mkhash();
        nodevec[parent[i]]->children.push_back(nodevec[i]);
    }

    // determine hash of the cells
    for (int i = 0; i < ncell; ++i) {
        nodevec[i]->mkhash();
    }

    // sort it by tree size (from smaller to larger)
    std::sort(nodevec.begin(), nodevec.begin() + ncell, tnode_earlier);
}

// Comparator for the naive interleaving: order by treenode_order first, then by cellindex.
static bool interleave_comp(TNode* a, TNode* b) {
    if (a->treenode_order != b->treenode_order) {
        return a->treenode_order < b->treenode_order;
    }
    return a->cellindex < b->cellindex;
}

/**
 * \brief Naive interleaving strategy (interleave_permute_type == 1)
 *
 * Sort so nodevec[ncell:nnode] cell instances are interleaved. Keep the
 * secondary ordering with respect to treenode_order so each cell is still a tree.
 *
 * \param ncell number of cells (trees)
 * \param nodevec vector that contains compartments (nodes of the trees)
 */
void node_interleave_order(int ncell, VecTNode& nodevec) {
    int* order = new int[ncell];
    for (int i = 0; i < ncell; ++i) {
        order[i] = 0;
        nodevec[i]->treenode_order = order[i]++;
    }
    for (size_t i = 0; i < nodevec.size(); ++i) {
        TNode& nd = *nodevec[i];
        for (size_t j = 0; j < nd.children.size(); ++j) {
            TNode* cnode = nd.children[j];
            cnode->treenode_order = order[nd.cellindex]++;
        }
    }
    delete[] order;

    //  std::sort(nodevec.begin() + ncell, nodevec.end(), contig_comp);
    // Traversal of nodevec: From root to leaves (this is why we compute the tree node order)
    std::sort(nodevec.begin() + ncell, nodevec.end(), interleave_comp);

#if CORENRN_DEBUG
    for (size_t i = 0; i < nodevec.size(); ++i) {
        TNode& nd = *nodevec[i];
        printf("%ld cell=%ld ix=%d\n", i, nd.cellindex, nd.nodeindex);
    }
#endif
}

static void admin1(int ncell,
                   VecTNode& nodevec,
                   int& nwarp,
                   int& nstride,
                   int*& stride,
                   int*& firstnode,
                   int*& lastnode,
                   int*& cellsize) {
    firstnode = (int*) ecalloc_align(ncell, sizeof(int));
    lastnode = (int*) ecalloc_align(ncell, sizeof(int));
    cellsize = (int*) ecalloc_align(ncell, sizeof(int));

    nwarp = (ncell % warpsize == 0) ? (ncell / warpsize) : (ncell / warpsize + 1);

    for (int i = 0; i < ncell; ++i) {
        firstnode[i] = -1;
        lastnode[i] = -1;
        cellsize[i] = 0;
    }

    nstride = 0;
    for (size_t i = ncell; i < nodevec.size(); ++i) {
        TNode& nd = *nodevec[i];
        size_t ci = nd.cellindex;
        if (firstnode[ci] == -1) {
            firstnode[ci] = i;
        }
        lastnode[ci] = i;
        cellsize[ci] += 1;
        if (nstride < cellsize[ci]) {
            nstride = cellsize[ci];
        }
    }

    // this vector is used to move from one compartment to the other (per cell)
    // its length is equal to the cell with the highest number of compartments
    stride = static_cast<int*>(ecalloc_align(nstride + 1, sizeof(int)));
    for (size_t i = ncell; i < nodevec.size(); ++i) {
        TNode& nd = *nodevec[i];
        // compute how many compartments with the same order
        // treenode_order : defined in breadth first fashion (for each cell separately)
        stride[nd.treenode_order - 1] += 1;  // -1 because treenode order includes root
    }
}

// for admin2 we allow the node organisation in warps of (say 4 cores per warp)
// ...............  ideal warp but unbalanced relative to warp with max cycles
// ...............  ncycle = 15, icore [0:4), all strides are 4.
// ...............
// ...............
//
// ..........       unbalanced relative to warp with max cycles
// ..........       ncycle = 10, not all strides the same because
// ..........       of need to avoid occasional race conditions.
//  .  . ..         icore [4:8) only 4 strides of 4
//
// ....................  ncycle = 20, uses only one core in the warp (cable)
//                       icore 8, all ncycle strides are 1

// One thing to be unhappy about is the large stride vector of size about
// number of compartments/warpsize. There are a lot of models where the
// stride for a warp is constant except for one cycle in the warp and that
// is easy to obtain when there are more than warpsize cells per warp.

static size_t stride_length(size_t begin, size_t end, VecTNode& nodevec) {
    // return stride length starting at i. Do not go past j.
    // max stride is warpsize.
    // At this time, only assume vicious parent race conditions matter.
    if (end - begin > warpsize) {
        end = begin + warpsize;
    }
    for (size_t i = begin; i < end; ++i) {
        TNode* nd = nodevec[i];
        nrn_assert(nd->nodevec_index == i);
        size_t diff = dist2child(nd);
        if (i + diff < end) {
            end = i + diff;
        }
    }
    return end - begin;
}

/**
 * \brief Prepare for solve_interleaved2
 *
 * One group of cells per warp.
 *
 * warp[i] has a number of compute cycles (ncycle[i])
 * the index of its first root (rootbegin[i], last rootbegin[nwarp] = ncell)
 * the index of its first node (nodebegin[i], last nodebegin[nwarp] = nnode)
 *
 * Each compute cycle has a stride
 * A stride is how many nodes are processed by a warp in one compute cycle
 * There are nstride strides. nstride is the sum of ncycles of all warps.
 * warp[i] has ncycle[i] strides
 * same as sum of ncycle
 * warp[i] has a stridedispl[i] which is stridedispl[i-1] + ncycle[i].
 * ie. The zeroth cycle of warp[j] works on stride[stridedispl[j]]
 * The value of a stride beginning at node i (node i is computed by core 0 of
 * some warp for some cycle) is determined by stride_length(i, j, nodevec)
 *
 */
static void admin2(int ncell,
                   VecTNode& nodevec,
                   int& nwarp,
                   int& nstride,
                   int*& stridedispl,
                   int*& strides,
                   int*& rootbegin,
                   int*& nodebegin,
                   int*& ncycles) {
    // All output arrays are zero-initialised int arrays obtained from ecalloc_align.
    const auto alloc_ints = [](size_t count) {
        return static_cast<int*>(ecalloc_align(count, sizeof(int)));
    };

    // the number of groups is the number of warps needed
    // (taken from the groupindex of the last root, nodevec[ncell - 1])
    nwarp = nodevec[ncell - 1]->groupindex + 1;

    ncycles = alloc_ints(nwarp);
    stridedispl = alloc_ints(nwarp + 1);  // running sum of ncycles (start at 0)
    rootbegin = alloc_ints(nwarp + 1);    // index (+1) of first root in warp.
    nodebegin = alloc_ints(nwarp + 1);    // index (+1) of first node in warp.

    // rootbegin[g + 1] ends up as (index + 1) of the last root whose groupindex is g;
    // later roots of the same group overwrite earlier ones.
    rootbegin[0] = 0;
    for (size_t iroot = 0; iroot < size_t(ncell); ++iroot) {
        rootbegin[nodevec[iroot]->groupindex + 1] = iroot + 1;
    }

    // Same bookkeeping for the non-root nodes, which start at index ncell.
    nodebegin[0] = ncell;
    for (size_t inode = size_t(ncell); inode < nodevec.size(); ++inode) {
        nodebegin[nodevec[inode]->groupindex + 1] = inode + 1;
    }

    // First pass: count the cycles of each warp (ncycles), accumulate them
    // into stridedispl and into the total nstride.
    nstride = 0;
    stridedispl[0] = 0;
    for (int iwarp = 0; iwarp < nwarp; ++iwarp) {
        const size_t warp_end = size_t(nodebegin[iwarp + 1]);
        size_t pos = nodebegin[iwarp];
        int cycles = 0;
        // each stride consumed is one loop iteration (cycle) of the warp
        while (pos < warp_end) {
            pos += stride_length(pos, warp_end, nodevec);
            ++cycles;
        }
        ncycles[iwarp] = cycles;
        stridedispl[iwarp + 1] = stridedispl[iwarp] + cycles;
        nstride += cycles;
    }

    // Second pass: now that the total is known, record every stride length.
    strides = alloc_ints(nstride);
    nstride = 0;
    for (int iwarp = 0; iwarp < nwarp; ++iwarp) {
        const size_t warp_end = size_t(nodebegin[iwarp + 1]);
        size_t pos = nodebegin[iwarp];
        while (pos < warp_end) {
            const int len = stride_length(pos, warp_end, nodevec);
            pos += len;
            strides[nstride] = len;
            ++nstride;
        }
    }

#if CORENRN_DEBUG
    printf("warp rootbegin nodebegin stridedispl\n");
    for (int i = 0; i <= nwarp; ++i) {
        printf("%4d %4d %4d %4d\n", i, rootbegin[i], nodebegin[i], stridedispl[i]);
    }
#endif
}
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/cellorder2.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstdio>
#include <map>
#include <set>
#include <algorithm>
#include <cstring>
#include <numeric>

#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/network/tnode.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"

// experiment starting with identical cell ordering
// groupindex already defined that keeps identical cells together
// begin with leaf to root ordering
namespace coreneuron {
// Shorthand aliases for the nested node containers used in this file:
// a level is a vector of nodes, a group is a vector of levels, and
// VVVTN holds all groups.
using VTN = VecTNode;             // level of nodes
using VVTN = std::vector<VTN>;    // group of levels
using VVVTN = std::vector<VVTN>;  // groups

// verify level in groups of nident identical nodes
// NOTE: the body is empty, so this is currently a no-op; both parameters are unused.
void chklevel(VTN& level, size_t nident = 8) {}

// first child before second child, etc
// if same parent level, then parent order
// if not same parent, then earlier parent (no parent earlier than parent)
// if same parents, then children order
// if no parents then nodevec_index order.
static bool sortlevel_cmp(TNode* a, TNode* b) {
    // Strict "a before b" ordering used by sortlevel().
    // Key 1: parent level + 1 (0 for roots, which have no parent); the smaller
    //        key comes first. Only differs when starting with leaf to root order.
    const size_t a_parent_key = a->parent ? 1 + a->parent->level : 0;
    const size_t b_parent_key = b->parent ? 1 + b->parent->level : 0;
    if (a_parent_key != b_parent_key) {
        return a_parent_key < b_parent_key;
    }

    // Both are roots: fall back to nodevec_index order.
    if (a_parent_key == 0) {
        return a->nodevec_index < b->nodevec_index;
    }

    // Key 2: the node's own treenode_order (children order).
    if (a->treenode_order != b->treenode_order) {
        return a->treenode_order < b->treenode_order;
    }

    // Key 3: the parent's treenode_order (parents already sorted).
    return a->parent->treenode_order < b->parent->treenode_order;
}

// Sort one level with sortlevel_cmp, then renumber treenode_order so that it
// equals each node's position within the sorted level.
static void sortlevel(VTN& level) {
    std::sort(level.begin(), level.end(), sortlevel_cmp);

    size_t position = 0;
    for (TNode* node: level) {
        node->treenode_order = position++;
    }
}

// TODO: refactor since sortlevel() is traversing the nodes in same order
// Number all nodes consecutively (starting at 0), level by level and in the
// stored order within each level, writing the number into treenode_order.
static void set_treenode_order(VVTN& levels) {
    size_t next_order = 0;
    for (size_t ilevel = 0; ilevel < levels.size(); ++ilevel) {
        VTN& level = levels[ilevel];
        for (size_t inode = 0; inode < level.size(); ++inode) {
            level[inode]->treenode_order = next_order;
            ++next_order;
        }
    }
}

#if CORENRN_DEBUG
// every level starts out with no race conditions involving both
// parent and child in the same level. Can we arrange things so that
// every level has at least 32 nodes?
// Index of the warpsize-sized block of nodevec positions that contains nd.
static size_t g32(TNode* nd) {
    const size_t index = nd->nodevec_index;
    return index / warpsize;
}

// True when at least one child of nd falls in the same warpsize block (g32)
// as nd itself (vitiating race).
static bool is_parent_race(TNode* nd) {
    const size_t parent_block = g32(nd);
    return std::any_of(nd->children.begin(), nd->children.end(), [parent_block](TNode* child) {
        return parent_block == g32(child);
    });
}
#endif

// less than 32 apart
static bool is_parent_race2(TNode* nd) {  // vitiating
    // Race when some child sits fewer than warpsize positions after the parent.
    // The difference is unsigned (child index minus parent index).
    const size_t parent_index = nd->nodevec_index;
    const auto& kids = nd->children;
    for (size_t k = 0; k < kids.size(); ++k) {
        const bool too_close = kids[k]->nodevec_index - parent_index < warpsize;
        if (too_close) {
            return true;
        }
    }
    return false;
}

#if CORENRN_DEBUG
// True when two children of nd share a warpsize block (g32).
static bool is_child_race(TNode* nd) {  // potentially handleable by atomic
    const auto& kids = nd->children;
    switch (kids.size()) {
    case 0:
    case 1:
        // fewer than two children cannot collide
        return false;
    case 2:
        return g32(kids[0]) == g32(kids[1]);
    default:
        break;
    }
    // three or more children: look for any repeated block index
    std::set<size_t> seen_blocks;
    for (TNode* kid: kids) {
        const bool inserted = seen_blocks.insert(g32(kid)).second;
        if (!inserted) {
            return true;
        }
    }
    return false;
}
#endif

// True when two children of nd are fewer than warpsize positions apart.
static bool is_child_race2(TNode* nd) {  // potentially handleable by atomic
    const auto& kids = nd->children;
    const size_t nkids = kids.size();
    if (nkids < 2) {
        return false;
    }
    if (nkids == 2) {
        // exactly two children: compare the absolute distance
        const size_t first = kids[0]->nodevec_index;
        const size_t second = kids[1]->nodevec_index;
        const size_t gap = (first < second) ? (second - first) : (first - second);
        return gap < warpsize;
    }
    // more than two children: compare each child with its predecessor in the
    // children vector, using the unsigned difference (current - previous)
    size_t previous = kids[0]->nodevec_index;
    for (size_t k = 1; k < nkids; ++k) {
        const size_t current = kids[k]->nodevec_index;
        if (current - previous < warpsize) {
            return true;
        }
        previous = current;
    }
    return false;
}

// Smallest (unsigned) nodevec_index distance from nd to any of its children.
// Returns the initial value 1000 when nd has no children or when every child
// is at least that far away.
size_t dist2child(TNode* nd) {
    size_t nearest = 1000;
    const size_t parent_index = nd->nodevec_index;
    for (size_t k = 0; k < nd->children.size(); ++k) {
        const std::size_t gap = nd->children[k]->nodevec_index - parent_index;
        nearest = (gap < nearest) ? gap : nearest;
    }
    return nearest;
}

// from stackoverflow.com
// Move the `length` elements starting at `start` so that they end just before
// position `dst` (when start < dst) or begin at `dst` (otherwise), by rotating
// the affected sub-range of v.
template <typename T>
static void move_range(size_t start, size_t length, size_t dst, std::vector<T>& v) {
    const auto base = v.begin();
    if (start < dst) {
        // moving toward higher indices: rotate [start, dst) around start + length
        std::rotate(base + start, base + start + length, base + dst);
    } else {
        // moving toward lower indices: rotate [dst, start + length) around start
        std::rotate(base + dst, base + start, base + start + length);
    }
}

static void move_nodes(size_t start, size_t length, size_t dst, VTN& nodes) {
    nrn_assert(dst <= nodes.size());
    nrn_assert(start + length <= dst);
    move_range(start, length, dst, nodes);

    // check correctness of move
    for (size_t i = start; i < dst - length; ++i) {
        nrn_assert(nodes[i]->nodevec_index == i + length);
    }
    for (size_t i = dst - length; i < dst; ++i) {
        nrn_assert(nodes[i]->nodevec_index == start + (i - (dst - length)));
    }

    // update nodevec_index
    for (size_t i = start; i < dst; ++i) {
        nodes[i]->nodevec_index = i;
    }
}

#if CORENRN_DEBUG
// least number of nodes to move after nd to eliminate prace
// warpsize minus (offset of nd within its warpsize block + distance to its
// nearest child); computed in unsigned arithmetic.
static size_t need2move(TNode* nd) {
    const size_t offset_in_block = nd->nodevec_index % warpsize;
    const size_t child_distance = dist2child(nd);
    return warpsize - (offset_in_block + child_distance);
}

// Debug helper: print every warpsize-sized block of `nodes` that consists only
// of leaves (nodes without children), followed by the number of such blocks.
// The last block may be shorter than warpsize when nodes.size() is not a
// multiple of warpsize; only the nodes that exist are inspected.
static void how_many_warpsize_groups_have_only_leaves(VTN& nodes) {
    size_t n = 0;
    for (size_t i = 0; i < nodes.size(); i += warpsize) {
        // clamp the block end so a partial final block is not read out of bounds
        const size_t block_end = std::min<size_t>(nodes.size(), i + warpsize);
        bool r = true;
        for (size_t j = i; j < block_end; ++j) {
            if (!nodes[j]->children.empty()) {
                r = false;
                break;
            }
        }
        if (r) {
            printf("warpsize group %ld starting at level %ld\n", i / warpsize, nodes[i]->level);
            ++n;
        }
    }
    printf("number of warpsize groups with only leaves = %ld\n", n);
}

// Debug helper: walk from the last node toward lower indices until the first
// level-0 node is reached, count parent races (block based and distance
// based) and child races, and print details for each block-based parent race.
static void pr_race_situation(VTN& nodes) {
    size_t n_parent_race2 = 0;
    size_t n_parent_race = 0;
    size_t n_child_race = 0;
    size_t i = nodes.size() - 1;
    while (nodes[i]->level != 0) {
        TNode* nd = nodes[i];
        if (is_parent_race2(nd)) {
            ++n_parent_race2;
        }
        if (is_parent_race(nd)) {
            printf("level=%ld i=%ld d=%ld n=%ld",
                   nd->level,
                   nd->nodevec_index,
                   dist2child(nd),
                   need2move(nd));
            for (size_t k = 0; k < nd->children.size(); ++k) {
                TNode* child = nd->children[k];
                printf("   %ld %ld", child->level, child->nodevec_index);
            }
            printf("\n");
            ++n_parent_race;
        }
        if (is_child_race(nd)) {
            ++n_child_race;
        }
        --i;
    }
    printf("prace=%ld  crace=%ld prace2=%ld\n", n_parent_race, n_child_race, n_parent_race2);
}
#endif

// Search toward lower indices, starting just below nd, for the nearest leaf
// (node without children) and return its index in `nodes`.
// Index 0 is never examined; a return value of 0 means "no leaf found".
static size_t next_leaf(TNode* nd, VTN& nodes) {
    // Nothing lies below index 0. Without this guard `nodevec_index - 1`
    // would wrap around (unsigned) and index far outside `nodes`.
    if (nd->nodevec_index == 0) {
        return 0;
    }
    for (size_t i = nd->nodevec_index - 1; i > 0; --i) {
        if (nodes[i]->children.empty()) {
            return i;
        }
    }
    //  nrn_assert(i > 0);
    return 0;
}

// Evaluates is_parent_race2 for every node from nd to the end of `nodes`.
// The reporting printf is commented out, so nothing is recorded or printed.
static void checkrace(TNode* nd, VTN& nodes) {
    size_t i = nd->nodevec_index;
    while (i < nodes.size()) {
        if (is_parent_race2(nodes[i])) {
            //      printf("checkrace %ld\n", i);
        }
        ++i;
    }
}

// Opportunistically move `d` leaves to the position right after `nd`.
// Leaves are searched toward lower indices starting below `look`; contiguous
// runs of leaves are moved together (at most the number still needed).
// Returns false as soon as no further leaf can be found (leaves moved before
// that point stay moved); returns true once d leaves have been moved.
static bool eliminate_race(TNode* nd, size_t d, VTN& nodes, TNode* look) {
    // printf("eliminate_race %ld %ld\n", nd->nodevec_index, d);
    size_t i = look->nodevec_index;
    while (d > 0) {
        i = next_leaf(nodes[i], nodes);
        if (i == 0) {
            // no leaves left to move
            return false;
        }
        // extend the run downward over adjacent leaves, up to d nodes in total.
        // `i > 0` keeps nodes[i - 1] in bounds when the run reaches index 0.
        size_t n = 1;
        while (i > 0 && n < d && nodes[i - 1]->children.empty()) {
            --i;
            ++n;
        }
        // printf("  move_nodes src=%ld len=%ld dest=%ld\n", i, n, nd->nodevec_index);
        move_nodes(i, n, nd->nodevec_index + 1, nodes);
        d -= n;
    }
    checkrace(nd, nodes);
    return true;
}

// Try to remove a parent race at nd by moving (warpsize - distance to the
// nearest child) leaves to just after nd. Failure is silently ignored; the
// diagnostic below is compiled in but disabled by the constant 0.
static void eliminate_prace(TNode* nd, VTN& nodes) {
    const size_t child_distance = dist2child(nd);
    const size_t d = warpsize - child_distance;
    const bool eliminated = eliminate_race(nd, d, nodes, nd);
    if (0 && !eliminated) {
        printf("could not eliminate prace for g=%ld  c=%ld l=%ld o=%ld   %ld\n",
               nd->groupindex,
               nd->cellindex,
               nd->level,
               nd->treenode_order,
               nd->hash);
    }
}

// Try to remove a child race between the first two children of nd by moving
// (warpsize - their distance) leaves to just after the first child, searching
// for leaves below nd. Failure is silently ignored; the diagnostic below is
// compiled in but disabled by the constant 0.
static void eliminate_crace(TNode* nd, VTN& nodes) {
    TNode* first_child = nd->children[0];
    const size_t c0 = first_child->nodevec_index;
    const size_t c1 = nd->children[1]->nodevec_index;
    const size_t gap = (c0 > c1) ? (c0 - c1) : (c1 - c0);
    const size_t d = warpsize - gap;
    const bool eliminated = eliminate_race(first_child, d, nodes, nd);
    if (0 && !eliminated) {
        printf("could not eliminate crace for g=%ld  c=%ld l=%ld o=%ld   %ld\n",
               nd->groupindex,
               nd->cellindex,
               nd->level,
               nd->treenode_order,
               nd->hash);
    }
}

// Flatten the levels of one group into a single node vector, then try to
// eliminate parent and child races by opportunistically moving leaves, and
// finally store the resulting positions in treenode_order.
static void question2(VVTN& levels) {
    // number of compartments in the group.
    // The initial value is a std::size_t so that the running sum is carried in
    // std::size_t rather than being narrowed to int at every step.
    std::size_t nnode = std::accumulate(levels.begin(),
                                        levels.end(),
                                        std::size_t{0},
                                        [](std::size_t s, const VTN& l) { return s + l.size(); });
    VTN nodes(nnode);  // store the sorted nodes from analyze function
    nnode = 0;
    for (const auto& level: levels) {
        for (const auto& l: level) {
            nodes[nnode++] = l;
        }
    }
    for (size_t i = 0; i < nodes.size(); ++i) {
        nodes[i]->nodevec_index = i;
    }

    //  how_many_warpsize_groups_have_only_leaves(nodes);

    // Here we need to make sure that the dependent nodes
    // belong to separate warps

    // work backward and check the distance from parent to children.
    // if parent in different group (warp?) then there is no vitiating race.
    // if children in different group (warp?) then there is no race (satisfied by
    // atomic).
    // If there is a vitiating race, then figure out how many nodes
    // need to be inserted just before the parent to avoid the race.
    //   It is not clear if we should prioritize safe nodes (when moved they
    //   do not introduce a race) and/or contiguous nodes (probably, to keep
    //   the low hanging fruit together).
    //   At least, moved nodes should have proper tree order and not themselves
    //   introduce a race at their new location.  Leaves are nice in that there
    //   are no restrictions in movement toward higher indices.
    //   Note that unless groups of 32 are inserted, it may be the case that
    //   races are generated at greater indices since otherwise a portion of
    //   each group is placed into the next group. This would not be an issue
    //   if, in fact, the stronger requirement of every parent having
    //   pi (parent index) + 32 <= ci (child index) is demanded instead of merely being in different
    //   warpsize. One nice thing about adding warpsize nodes is that it does not disturb any
    //   existing contiguous groups except the moved group which gets divided between parent
    //   warpsize and child, where the nodes past the parent get same relative indices in the next
    //   warpsize

    //  let's see how well we can do by opportunistically moving leaves to
    //  separate parents from children by warpsize (ie is_parent_race2 is false)
    //  Hopefully, we won't run out of leaves before eliminating all
    //  is_parent_race2

    // disabled diagnostic (constant 0 short-circuits the whole condition)
    if (0 && nodes.size() % warpsize != 0) {
        size_t nchild = nodes.size() - levels[0].size();
        printf("warp of %ld cells has %ld nodes in last cycle %ld\n",
               levels[0].size(),
               nchild % warpsize,
               nchild / warpsize + 1);
    }

    //  pr_race_situation(nodes);

    // eliminate parent and children races using leaves
    // traverse all the children (no roots), from the last node downward.
    // `i-- > nroot` visits exactly the indices nroot .. size-1 and, unlike
    // `i >= nroot` on an unsigned index, terminates when nroot is 0 or when
    // there are no nodes at all.
    const std::size_t nroot = levels.empty() ? 0 : levels[0].size();
    for (size_t i = nodes.size(); i-- > nroot;) {
        TNode* nd = nodes[i];
        if (is_child_race2(nd)) {
            eliminate_crace(nd, nodes);
            // nd may have been shifted by the moved leaves; continue from it
            i = nd->nodevec_index;
        }
        if (is_parent_race2(nd)) {
            eliminate_prace(nd, nodes);
            i = nd->nodevec_index;
        }
    }
    // copy nodes indices to treenode_order
    for (size_t i = 0; i < nodes.size(); ++i) {
        nodes[i]->treenode_order = i;
    }
}

// analyze each group of cells
// the cells are grouped based on warp balance (lpt) algorithm
static void analyze(VVTN& levels) {
    // Step 1: seed treenode_order of every child with its position in the
    // parent's children vector (first child 0, second child 1, ...).
    for (size_t ilevel = 0; ilevel < levels.size(); ++ilevel) {
        VTN& level = levels[ilevel];
        chklevel(level);  // does nothing
        for (TNode* parent: level) {
            size_t child_rank = 0;
            for (TNode* child: parent->children) {
                child->treenode_order = child_rank++;
            }
        }
    }

    // Step 2: sort each level; afterwards treenode_order within a level
    // ranges over [0:level.size]
    for (size_t ilevel = 0; ilevel < levels.size(); ++ilevel) {
        VTN& level = levels[ilevel];
        sortlevel(level);
        chklevel(level);  // does nothing
    }

    // Step 3: renumber consecutively across all levels.
    set_treenode_order(levels);
}

// Debug print (only when CORENRN_DEBUG is enabled): for each level index of
// the first group, print the level index and then the size of that level in
// every group.
void prgroupsize(VVVTN& groups) {
#if CORENRN_DEBUG
    for (size_t ilevel = 0; ilevel < groups[0].size(); ++ilevel) {
        printf("%5ld\n", ilevel);
        for (size_t igroup = 0; igroup < groups.size(); ++igroup) {
            printf(" %5ld", groups[igroup][ilevel].size());
        }
        printf("\n");
    }
#endif
}

// group index primary, treenode_order secondary
static bool final_nodevec_cmp(TNode* a, TNode* b) {
    // primary key: groupindex
    if (a->groupindex != b->groupindex) {
        return a->groupindex < b->groupindex;
    }
    // secondary key: treenode_order
    return a->treenode_order < b->treenode_order;
}

// Store each node's current position in nodevec into its nodevec_index.
static void set_nodeindex(VecTNode& nodevec) {
    size_t position = 0;
    for (TNode* node: nodevec) {
        node->nodevec_index = position++;
    }
}

// Reorder the non-root part of nodevec (indices >= ncell) by group and, within
// a group, by the treenode_order computed by analyze() and question2().
void group_order2(VecTNode& nodevec, size_t groupsize, size_t ncell) {
    const size_t maxlevel = level_from_root(nodevec);

    // reset TNode.groupindex
    // every warp deals with a group of cells; the cell dispatching to the
    // available groups is done through the warp_balance function (lpt algo)
    const size_t nwarp = warp_balance(ncell, nodevec);

    // Number of groups: nwarp when it is non-zero, otherwise ncell / groupsize
    // rounded up (groupsize is only used in that fallback).
    size_t ngroup = nwarp;
    if (ngroup == 0) {
        ngroup = ncell / groupsize;
        if (ncell % groupsize) {
            ++ngroup;
        }
    }

    // work on a cellgroup as a vector of levels. ie only possible race is
    // two children in same warpsize.
    // Every group starts with maxlevel + 1 empty levels.
    VVVTN groups(ngroup, VVTN(maxlevel + 1));

    // distribute the nodes according to their groupindex and their level
    // (see level_from_root)
    for (TNode* node: nodevec) {
        groups[node->groupindex][node->level].push_back(node);
    }

    prgroupsize(groups);  // debugging

    // deal with each group
    for (size_t igroup = 0; igroup < groups.size(); ++igroup) {
        VVTN& group = groups[igroup];
        analyze(group);
        question2(group);
    }

    // final nodevec order according to group_index and treenode_order;
    // the first ncell entries are left where they are
    std::sort(nodevec.begin() + ncell, nodevec.end(), final_nodevec_cmp);
    set_nodeindex(nodevec);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/data_layout.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/permute/data_layout.hpp"
#include "coreneuron/mechanism/mechanism.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/mechanism/membfunc.hpp"

namespace coreneuron {
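/*
 * Return the index into a mechanism's data array for variable `variable_index`
 * of instance `node_index`. Original input files are organized in AoS, but the
 * mechanism layout is required to be SoA here.
 */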
int get_data_index(int node_index, int variable_index, int mtype, Memb_list* ml) {
    // only the SoA layout is supported for mechanism type `mtype`
    const int layout = corenrn.get_mech_data_layout()[mtype];
    nrn_assert(layout == SOA_LAYOUT);
    // each variable occupies a run of _nodecount_padded entries
    const int variable_offset = variable_index * ml->_nodecount_padded;
    return variable_offset + node_index;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/data_layout.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

// Integer codes identifying a mechanism data layout:
// structure-of-arrays (SoA) and array-of-structures (AoS).
#define SOA_LAYOUT 0
#define AOS_LAYOUT 1
namespace coreneuron {
struct Memb_list;
// Index into a mechanism's data array of variable `variable_index` for
// instance `node_index`; `mtype` is the mechanism type and `ml` its Memb_list.
int get_data_index(int node_index, int variable_index, int mtype, Memb_list* ml);
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/node_permute.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

/*
Below, the sense of permutation, is reversed. Though consistent, forward
permutation should be defined as (and the code should eventually transformed)
so that
  v: original vector
  p: forward permutation
  pv: permuted vector
  pv[i] = v[p[i]]
and
  pinv: inverse permutation
  pv[pinv[i]] = v[i]
Note: pinv[p[i]] = i = p[pinv[i]]
*/

/*
Permute nodes.

To make gaussian elimination on gpu more efficient.

Permutation vector p[i] applied to a data vector, moves the data_original[i]
to data[p[i]].
That suffices for node properties such as area[i], a[i], b[i]. e.g.
  area[p[i]] <- area_original[i]

Notice that p on the left side is a forward permutation. On the right side
it serves as the inverse permutation.
area_original[i] <- area_permuted[p[i]]

but things
get a bit more complicated when the data is an integer index into the
original data.

For example:

parent[i] needs to be transformed so that
parent[p[i]] <- p[parent_original[i]] except that if parent_original[j] = -1
  then parent[p[j]] = -1

membrane mechanism nodelist ( a subset of nodes) needs to be at least
minimally transformed so that
nodelist_new[k] <- p[nodelist_original[k]]
This does not affect the order of the membrane mechanism property data.

However, computation is more efficient to permute (sort) nodelist_new so that
it follows as much as possible the permuted node ordering, ie in increasing
node order.  Consider this further mechanism specific nodelist permutation,
which is to be applied to the above nodelist_new, to be p_m, which has the same
size as nodelist. ie.
nodelist[p_m[k]] <- nodelist_new[k].

Notice the similarity to the parent case...
nodelist[p_m[k]] = p[nodelist_original[k]]

and now the membrane mechanism node data, does need to be permuted to have an
order consistent with the new nodelist. Since there are nm instances of the
mechanism each with sz data values (consider AoS layout).
The data permutation is
for k=[0:nm] for isz=[0:sz]
  data_m[p_m[k]*sz + isz] = data_m_original[k*sz + isz]

For an SoA layout the indexing is k + isz*nm (where nm may include padding).

A more complicated case is a mechanisms dparam array (nm instances each with
dsz values) Some of those values are indices into another mechanism (eg
pointers to ion properties) or voltage or area depending on the semantics of
the value. We can use the above data_m permutation but then need to update
the values according to the permutation of the object the value indexes into.
Consider the permutation of the target object to be p_t . Then a value
iold = pdata_m(k, isz) - data_t
For a target in AoS format (iold = k_t*sz_t + isz_t) this
refers to k_t = iold / sz_t and isz_t = iold % sz_t (ie iold - k_t*sz_t)
and for a target in SoA format (iold = k_t + isz_t*nm_t)
k_t = iold % nm_t and isz_t = iold / nm_t
ie k_t_new = p_m_t[k_t] so, for AoS, inew = k_t_new*sz_t + isz_t
or, for SoA, inew = k_t_new + isz_t*nm_t
so pdata_m(k, isz) = inew + data_t


*/

#include <vector>
#include <utility>
#include <algorithm>

#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/io/nrn_setup.hpp"
#include "coreneuron/nrniv/nrniv_decl.h"
#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/coreneuron.hpp"
namespace coreneuron {
// In-place permutation of `cnt` instances of `sz` values each:
//   data(p[icnt], isz) <- data(icnt, isz)
// This does not change the values, it merely moves them. p must have cnt
// entries. Nothing is done when p is null or cnt * sz < 1.
template <typename T>
void permute(T* data, int cnt, int sz, int layout, int* p) {
    if (!p) {
        return;
    }
    int n = cnt * sz;
    if (n < 1) {
        return;
    }
    if (layout == Layout::SoA) {  // for SoA, n might be larger due to cnt padding
        n = nrn_soa_padded_size(cnt, layout) * sz;
    }

    // snapshot of the unpermuted values to read from while data is overwritten
    const std::vector<T> snapshot(data, data + n);

    for (int icnt = 0; icnt < cnt; ++icnt) {
        const int icnt_new = p[icnt];
        for (int isz = 0; isz < sz; ++isz) {
            // note that when layout==0, nrn_i_layout takes into account SoA padding.
            const int src = nrn_i_layout(icnt, cnt, isz, sz, layout);
            const int dst = nrn_i_layout(icnt_new, cnt, isz, sz, layout);
            data[dst] = snapshot[src];
        }
    }
}

// Return a newly allocated array pinv of n ints with pinv[p[i]] = i.
// The caller owns the result and must release it with delete[].
int* inverse_permute(int* p, int n) {
    int* inverse = new int[n];
    for (const int* it = p; it != p + n; ++it) {
        inverse[*it] = static_cast<int>(it - p);
    }
    return inverse;
}

// Replace the permutation p (n entries) by its inverse, in place.
static void invert_permute(int* p, int n) {
    int* inverse = inverse_permute(p, n);
    std::copy(inverse, inverse + n, p);
    delete[] inverse;
}

// type_of_ntdata: Return the mechanism type (or voltage) for nt._data[i].
// Used for updating POINTER. Analogous to nrn_dblpntr2nrncore in NEURON.
// To reduce search time, consider voltage first, then try type_hints, which
// stores a few of the previous search result types.
// Most usage is for voltage. Most of the rest is likely for a specific type.
// Occasionally, eg. axial current, there are two types oscillating between
// a SUFFIX (for non-zero area node) and POINT_PROCESS (for zero area nodes)
// version.
// full_search: helper for type_of_ntdata. Return mech type for nt._data[i].
// Update type_hints.

// Mechanism types found by earlier full_search() calls. full_search() inserts
// each new type in front of the first larger entry; type_of_ntdata() tries
// these before a full search and clears the list when asked to reset.
static std::vector<int> type_hints;

// Walk the thread's mechanism list and return the type of the first mechanism
// whose data range [ml->data, ml->data + param_size * _nodecount_padded)
// contains pd. The type found is also inserted into type_hints, in front of
// the first hint that is larger. Asserts that a (positive) type was found.
static int full_search(NrnThread& nt, double* pd) {
    int type = -1;
    for (NrnThreadMembList* tml = nt.tml; tml != nullptr; tml = tml->next) {
        Memb_list* ml = tml->ml;
        const int n = corenrn.get_prop_param_size()[tml->index] * ml->_nodecount_padded;
        if (pd < ml->data || pd >= ml->data + n) {
            continue;  // pd is outside this mechanism's data
        }
        type = tml->index;
        // insert into type_hints
        const auto slot = std::find_if(type_hints.begin(), type_hints.end(), [type](int hint) {
            return type < hint;
        });
        type_hints.insert(slot, type);
        break;
    }
    assert(type > 0);
    return type;
}

// no longer static because also used by POINTER in nrn_checkpoint.cpp
// Return `voltage` when nt._data + i lies before nt._actual_area, otherwise
// the mechanism type whose data contains it. When `reset` is true the cached
// type_hints are cleared before they are consulted.
int type_of_ntdata(NrnThread& nt, int i, bool reset) {
    double* pd = nt._data + i;
    assert(pd >= nt._actual_v);
    // voltage first (area just after voltage)
    if (pd < nt._actual_area) {
        return voltage;
    }
    assert(size_t(i) < nt._ndata);

    if (reset) {
        type_hints.clear();
    }
    // then check the type hints (kept in type order by full_search)
    for (int hinted: type_hints) {
        Memb_list* ml = nt._ml_list[hinted];
        if (pd < ml->data) {
            // pd lies before this hinted mechanism's data: fall back to a
            // full search (which also records the new hint)
            return full_search(nt, pd);
        }
        // this or later
        const int n = corenrn.get_prop_param_size()[hinted] * ml->_nodecount_padded;
        if (pd < ml->data + n) {  // this is the one
            return hinted;
        }
    }
    // after the last type_hints
    return full_search(nt, pd);
}

// Rewrite the integer indices stored in ml->pdata so that they refer to the
// permuted locations of their targets. Each pdata column i is handled
// according to its dparam semantics:
//   -1 (area) and -9 (diam): index is remapped through nt._permute
//   -5 (POINTER): target is voltage (remapped through nt._permute) or the
//        data of another mechanism (remapped through that mechanism's _permute)
//   0..999 (ion): index into the ion mechanism's data, remapped through the
//        ion's _permute
// Columns with any other semantics are left untouched. Nothing is done for
// mechanisms without dparams, for artificial mechanisms, or when no semantics
// array is registered.
static void update_pdata_values(Memb_list* ml, int type, NrnThread& nt) {
    // assumes AoS to SoA transformation already made since we are using
    // nrn_i_layout to determine indices into both ml->pdata and into target data
    int psz = corenrn.get_prop_dparam_size()[type];
    if (psz == 0) {
        return;
    }
    if (corenrn.get_is_artificial()[type]) {
        return;
    }
    int* semantics = corenrn.get_memb_func(type).dparam_semantics;
    if (!semantics) {
        return;
    }
    int* pdata = ml->pdata;
    int layout = corenrn.get_mech_data_layout()[type];
    int cnt = ml->nodecount;
    // ml padding does not matter (but target padding does matter)

    // interesting semantics are -1 (area), -5 (pointer), -9 (diam), or 0-999 (ion variables)
    for (int i = 0; i < psz; ++i) {
        int s = semantics[i];
        if (s == -1) {                               // area
            int area0 = nt._actual_area - nt._data;  // includes padding if relevant
            int* p_target = nt._permute;
            for (int iml = 0; iml < cnt; ++iml) {
                int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout);
                // *pd is the original integer into nt._data . Needs to be replaced
                // by the permuted value

                // This is ok whether or not area changed by padding?
                // since old *pd updated appropriately by earlier AoS to SoA
                // transformation
                int ix = *pd - area0;  // original integer into area array.
                nrn_assert((ix >= 0) && (ix < nt.end));
                int ixnew = p_target[ix];
                *pd = ixnew + area0;
            }
        } else if (s == -9) {                        // diam
            // same procedure as for area, relative to the start of the diam array
            int diam0 = nt._actual_diam - nt._data;  // includes padding if relevant
            int* p_target = nt._permute;
            for (int iml = 0; iml < cnt; ++iml) {
                int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout);
                // *pd is the original integer into nt._data . Needs to be replaced
                // by the permuted value

                // This is ok whether or not diam changed by padding?
                // since old *pd updated appropriately by earlier AoS to SoA
                // transformation
                int ix = *pd - diam0;  // original integer into actual_diam array.
                nrn_assert((ix >= 0) && (ix < nt.end));
                int ixnew = p_target[ix];
                *pd = ixnew + diam0;
            }
        } else if (s == -5) {  // POINTER
            // assume pointer into nt._data. Most likely voltage.
            // If not voltage, most likely same mechanism for all indices.
            for (int iml = 0; iml < cnt; ++iml) {
                int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout);
                // the type-hint cache is reset for the first instance of each column
                int etype = type_of_ntdata(nt, *pd, iml == 0);
                if (etype == voltage) {
                    int v0 = nt._actual_v - nt._data;
                    int* e_target = nt._permute;
                    int ix = *pd - v0;  // original integer into voltage array.
                    nrn_assert((ix >= 0) && (ix < nt.end));
                    int ixnew = e_target[ix];
                    *pd = ixnew + v0;
                } else if (etype > 0) {
                    // about same as for ion below but check each instance
                    Memb_list* eml = nt._ml_list[etype];
                    int edata0 = eml->data - nt._data;
                    int ecnt = eml->nodecount;
                    int esz = corenrn.get_prop_param_size()[etype];
                    int elayout = corenrn.get_mech_data_layout()[etype];
                    int* e_permute = eml->_permute;
                    int i_ecnt, i_esz, padded_ecnt;
                    int ix = *pd - edata0;
                    // split ix into instance index (i_ecnt) and variable index (i_esz)
                    if (elayout == Layout::AoS) {
                        padded_ecnt = ecnt;
                        i_ecnt = ix / esz;
                        i_esz = ix % esz;
                    } else {  // SoA
                        assert(elayout == Layout::SoA);
                        padded_ecnt = nrn_soa_padded_size(ecnt, elayout);
                        i_ecnt = ix % padded_ecnt;
                        i_esz = ix / padded_ecnt;
                    }
                    // a target without a permutation keeps its instance index
                    int i_ecnt_new = e_permute ? e_permute[i_ecnt] : i_ecnt;
                    int ix_new = nrn_i_layout(i_ecnt_new, ecnt, i_esz, esz, elayout);
                    *pd = ix_new + edata0;
                } else {
                    nrn_assert(0);
                }
            }
        } else if (s >= 0 && s < 1000) {  // ion
            // the semantics value itself is the ion mechanism type
            int etype = s;
            int elayout = corenrn.get_mech_data_layout()[etype];
            Memb_list* eml = nt._ml_list[etype];
            int edata0 = eml->data - nt._data;
            int ecnt = eml->nodecount;
            int esz = corenrn.get_prop_param_size()[etype];
            int* e_permute = eml->_permute;
            for (int iml = 0; iml < cnt; ++iml) {
                int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout);
                int ix = *pd - edata0;
                // from ix determine i_ecnt and i_esz (need to permute i_ecnt)
                int i_ecnt, i_esz, padded_ecnt;
                if (elayout == Layout::AoS) {
                    padded_ecnt = ecnt;
                    i_ecnt = ix / esz;
                    i_esz = ix % esz;
                } else {  // SoA
                    assert(elayout == Layout::SoA);
                    padded_ecnt = nrn_soa_padded_size(ecnt, elayout);
                    i_ecnt = ix % padded_ecnt;
                    i_esz = ix / padded_ecnt;
                }
                // NOTE(review): unlike the POINTER branch, e_permute is used here
                // without a null check — presumably ions always have a permutation
                // at this point; confirm against the callers.
                int i_ecnt_new = e_permute[i_ecnt];
                int ix_new = nrn_i_layout(i_ecnt_new, ecnt, i_esz, esz, elayout);
                *pd = ix_new + edata0;
            }
        }
    }
}

// Remap the entries of vec through the permutation table: every entry that is
// >= 0 is replaced by permute[entry]. Negative entries are left untouched.
//   vec     : array of n indices, updated in place
//   n       : number of entries in vec
//   permute : lookup table indexed by the old (non-negative) value
void node_permute(int* vec, int n, int* permute) {
    for (int k = 0; k < n; ++k) {
        const int old_index = vec[k];
        if (old_index < 0) {
            continue;  // negative entries are not indices; keep them as they are
        }
        vec[k] = permute[old_index];
    }
}

// Move the n int entries of vec to new positions according to permutation p;
// the values themselves are not modified.
// Forwards to permute() with its third and fourth arguments both set to 1.
void permute_ptr(int* vec, int n, int* p) {
    permute(vec, n, 1, 1, p);
}

// Same as permute_ptr() but for an array of n doubles: forwards to permute()
// with its third and fourth arguments both set to 1.
void permute_data(double* vec, int n, int* p) {
    permute(vec, n, 1, 1, p);
}

// Apply ml->_permute to the data and pdata arrays of a mechanism's Memb_list
// and then fix up the pdata values via update_pdata_values().
//   ml   : mechanism instance list to permute
//   type : mechanism type, used to look up the per-instance sizes and layout
//   nt   : thread that owns ml (passed on to update_pdata_values)
void permute_ml(Memb_list* ml, int type, NrnThread& nt) {
    const int param_width = corenrn.get_prop_param_size()[type];
    const int dparam_width = corenrn.get_prop_dparam_size()[type];
    const int mech_layout = corenrn.get_mech_data_layout()[type];

    // Both arrays hold ml->nodecount instances and use the same permutation.
    permute(ml->data, ml->nodecount, param_width, mech_layout, ml->_permute);
    permute(ml->pdata, ml->nodecount, dparam_width, mech_layout, ml->_permute);

    update_pdata_values(ml, type, nt);
}

// Translate an index into a mechanism's data array so that it refers to the
// same field of the same instance after the instances were permuted with
// ml->_permute.
//   ix   : index relative to the start of the mechanism's data
//   type : mechanism type (selects layout and per-instance size)
//   ml   : mechanism instance list holding the permutation
// Returns ix unchanged when ml has no permutation.
int nrn_index_permute(int ix, int type, Memb_list* ml) {
    const int* perm = ml->_permute;
    if (perm == nullptr) {
        return ix;
    }

    const int layout = corenrn.get_mech_data_layout()[type];
    if (layout != Layout::AoS) {
        assert(layout == Layout::SoA);
        // SoA: the instance index varies fastest, over the padded instance count.
        const int stride = nrn_soa_padded_size(ml->nodecount, layout);
        const int instance = ix % stride;
        const int field = ix / stride;
        return field * stride + perm[instance];
    }

    // AoS: the fields of one instance are contiguous.
    const int width = corenrn.get_prop_param_size()[type];
    const int instance = ix / width;
    const int field = ix % width;
    return perm[instance] * width + field;
}

#if CORENRN_DEBUG
// Debug helper: print label s followed by "index value" pairs for the n ints
// in x, all on one line.
static void pr(const char* s, int* x, int n) {
    printf("%s:", s);
    int k = 0;
    while (k < n) {
        printf("  %d %d", k, x[k]);
        ++k;
    }
    printf("\n");
}

// Debug helper: same as above for an array of n doubles.
static void pr(const char* s, double* x, int n) {
    printf("%s:", s);
    int k = 0;
    while (k < n) {
        printf("  %d %g", k, x[k]);
        ++k;
    }
    printf("\n");
}
#endif

// note that sort_indices has the sense of an inverse permutation in that
// the value of sort_indices[0] is the index with the smallest value in the
// indices array

// Strict ordering for (value, original index) pairs: compare by value first
// and break ties by the original index.
static bool nrn_index_sort_cmp(const std::pair<int, int>& a, const std::pair<int, int>& b) {
    if (a.first != b.first) {
        return a.first < b.first;
    }
    return a.second < b.second;
}

// Return a newly allocated array (new int[n], owned by the caller) listing the
// positions of `values` in increasing order of value; equal values keep their
// original relative order because ties are broken by position.
static int* nrn_index_sort(int* values, int n) {
    // Pair every value with the position it came from.
    std::vector<std::pair<int, int>> keyed(n);
    int pos = 0;
    for (auto& entry: keyed) {
        entry = std::make_pair(values[pos], pos);
        ++pos;
    }

    std::sort(keyed.begin(), keyed.end(), nrn_index_sort_cmp);

    // Keep only the original positions, now in sorted order.
    int* const order = new int[n];
    int* out = order;
    for (const auto& entry: keyed) {
        *out++ = entry.second;
    }
    return order;
}

// Renumber ml->nodeindices with the node permutation p, then derive
// ml->_permute so that the instances end up ordered by increasing node index,
// and reorder ml->nodeindices accordingly.
void permute_nodeindices(Memb_list* ml, int* p) {
    const int count = ml->nodecount;
    int* const indices = ml->nodeindices;

    // Step 1: the stored node indices take their new values. This only changes
    // the values, not the position of each instance.
    node_permute(indices, count, p);

    // Step 2: sort the instances by their new node index. Instances sharing a
    // node keep their original relative order, so their contributions to
    // rhs, d (if any) remain in the same order (except for gpu parallelism).
    ml->_permute = nrn_index_sort(indices, count);
    invert_permute(ml->_permute, count);

    // Step 3: move the node indices themselves into the sorted order.
    permute_ptr(indices, count, ml->_permute);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/permute/node_permute.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/sim/multicore.hpp"

namespace coreneuron {
// Renumber ml->nodeindices with the node permutation `permute`, then determine
// ml->_permute (instances sorted by increasing node index) and reorder
// ml->nodeindices accordingly.
void permute_nodeindices(Memb_list* ml, int* permute);

// vec values >= 0 updated according to permutation (vec[i] = permute[vec[i]]);
// negative values are left unchanged
void node_permute(int* vec, int n, int* permute);

// moves values to new location but does not change those values
void permute_ptr(int* vec, int n, int* permute);

// same as permute_ptr but for an array of doubles
void permute_data(double* vec, int n, int* permute);
// permute ml->data and ml->pdata with ml->_permute, then update the pdata values
void permute_ml(Memb_list* ml, int type, NrnThread& nt);
// map an index into the data of mechanism `type` to its location after
// ml->_permute has been applied; returns the index unchanged if ml->_permute is null
int nrn_index_permute(int, int type, Memb_list* ml);

// NOTE(review): defined outside this header; presumably returns the inverse of
// the permutation p of length n -- confirm ownership of the returned array.
int* inverse_permute(int* p, int n);

// NOTE(review): defined outside this header; presumably reports the mechanism
// type that owns the given index into the thread's data -- confirm the meaning
// of `reset` against the definition.
int type_of_ntdata(NrnThread&, int index, bool reset);
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/fadvance_core.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <functional>

#include "coreneuron/coreneuron.hpp"
#include "coreneuron/nrnconf.h"
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/io/reports/nrnreport.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/network/netpar.hpp"
#include "coreneuron/network/partrans.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/utils/progressbar/progressbar.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/io/nrn2core_direct.h"

namespace coreneuron {
static void* nrn_fixed_step_thread(NrnThread*);
static void nrn_fixed_step_group_thread(NrnThread*, int, int, int&);


namespace {

/// RAII wrapper around the C progressbar API.
///
/// The bar is only shown on MPI rank 0 and when quiet mode is off; otherwise
/// every member function is a no-op apart from step counting. The underlying
/// handle is created in the constructor and finished in the destructor, so the
/// class is non-copyable: a copy would finish the same handle twice.
class ProgressBar final {
    /// Handle of the underlying bar; stays null when the bar is not shown.
    progressbar* pbar = nullptr;
    /// Last step number passed to update().
    int current_step = 0;
    /// Whether this instance draws anything at all.
    bool show;
    /// The bar is redrawn only every this many steps.
    constexpr static int progressbar_update_steps = 5;

  public:
    /// Create a bar labelled "psolve" that expects @p nsteps steps.
    explicit ProgressBar(int nsteps)
        : show(nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
        if (show) {
            printf("\n");
            pbar = progressbar_new("psolve", nsteps);
        }
    }

    ProgressBar(const ProgressBar&) = delete;
    ProgressBar& operator=(const ProgressBar&) = delete;

    /// Record @p step as the current step and redraw the bar (with simulation
    /// time @p time) when the step is a multiple of progressbar_update_steps.
    void update(int step, double time) {
        current_step = step;
        if (show && (current_step % progressbar_update_steps) == 0) {
            progressbar_update(pbar, current_step, time);
        }
    }

    /// Advance by exactly one step.
    void step(double time) {
        update(current_step + 1, time);
    }

    ~ProgressBar() {
        if (show) {
            progressbar_finish(pbar);
        }
    }
};

}  // unnamed namespace


// Push the global time `t` and time step `dt` into every NrnThread and
// recompute each thread's cj (2/dt when secondorder is set, 1/dt otherwise).
// Nothing is done when adt equals the first thread's current _dt, so callers
// pass a value such as -1 to force the update.
void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
    if (adt == nrn_threads[0]._dt) {
        return;
    }
    for (NrnThread* nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) {
        nt->_t = t;
        nt->_dt = dt;
        nt->cj = (secondorder ? 2.0 : 1.0) / dt;
        // Mirror the three updated scalars to the device copy of the thread.
        nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
                           async(nt->stream_id) if (nt->compute_gpu))
        // clang-format off
        nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
                                     if(nt->compute_gpu))
        // clang-format on
    }
}

// Advance all threads by one fixed time step and copy the resulting thread
// time back into the global `t`.
void nrn_fixed_step_minimal() { /* not so minimal anymore with gap junctions */
    Instrumentor::phase p_timestep("timestep");

    // If the global time no longer matches the thread time, force dt2thread to
    // resynchronise by passing -1; otherwise it only acts when dt has changed.
    const bool time_in_sync = (t == nrn_threads->_t);
    dt2thread(time_in_sync ? dt : -1.);

    nrn_thread_table_check();
    nrn_multithread_job(nrn_fixed_step_thread);

    // With gap junctions the step is split: voltages are transferred between
    // the two halves, and the second half is run as a separate job.
    if (nrn_have_gaps) {
        {
            Instrumentor::phase p_gap("gap-v-transfer");
            nrnmpi_v_transfer();
        }
        nrn_multithread_job(nrn_fixed_step_lastpart);
    }

#if NRNMPI
    if (nrn_threads[0]._stop_stepping) {
        nrn_spike_exchange(nrn_threads);
    }
#endif

#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
    {
        Instrumentor::phase p("flush_reports");
        nrn_flush_reports(nrn_threads[0]._t);
    }
#endif

    t = nrn_threads[0]._t;
}

/* better cache efficiency since a thread can do an entire minimum delay
integration interval before joining
*/
/// --> Coreneuron


// Run single fixed steps until the simulation time reaches tstop or stoprun
// is set, updating the progress bar after each completed step.
//   total_sim_steps : number of steps the progress bar is sized for
//   tstop           : simulation end time
void nrn_fixed_single_steps_minimal(int total_sim_steps, double tstop) {
    ProgressBar progress_bar(total_sim_steps);
#if NRNMPI
    const double last_start_time = tstop - dt;
    nrn_assert(nrn_threads->_t <= tstop);
    // It may very well be the case that we do not advance at all
    const auto must_step = [last_start_time] { return nrn_threads->_t <= last_start_time; };
#else
    // Half a step of slack guards against round-off in the accumulated time.
    const double last_start_time = tstop - .5 * dt;
    const auto must_step = [last_start_time] { return nrn_threads->_t < last_start_time; };
#endif
    while (must_step()) {
        nrn_fixed_step_minimal();
        if (stoprun) {
            break;
        }
        progress_bar.step(nrn_threads[0]._t);
    }
}


// Run total_sim_steps fixed steps in groups: each thread keeps stepping inside
// nrn_fixed_step_group_thread until its _stop_stepping flag is raised (or the
// step budget is used up), after which spikes are exchanged and reports are
// flushed before the next group starts.
void nrn_fixed_step_group_minimal(int total_sim_steps) {
    dt2thread(dt);
    nrn_thread_table_check();
    int step_group_n = total_sim_steps;
    int step_group_begin = 0;
    // Written by thread 0 inside nrn_fixed_step_group_thread: one past the
    // last step completed in the current group.
    int step_group_end = 0;

    ProgressBar progress_bar(step_group_n);
    while (step_group_end < step_group_n) {
        nrn_multithread_job(nrn_fixed_step_group_thread,
                            step_group_n,
                            step_group_begin,
                            step_group_end);
#if NRNMPI
        nrn_spike_exchange(nrn_threads);
#endif

#if defined(ENABLE_BIN_REPORTS) || defined(ENABLE_SONATA_REPORTS)
        {
            Instrumentor::phase p("flush_reports");
            nrn_flush_reports(nrn_threads[0]._t);
        }
#endif
        if (stoprun) {
            break;
        }
        // The next group resumes where this one stopped.
        step_group_begin = step_group_end;
        progress_bar.update(step_group_end, nrn_threads[0]._t);
    }
    // Publish the thread time as the global simulation time.
    t = nrn_threads[0]._t;
}

// Per-thread body of nrn_fixed_step_group_minimal: take steps starting at
// step_group_begin until either the thread's _stop_stepping flag is raised or
// step_group_max is reached. Only the thread with id 0 reports, through
// step_group_end, one past the last step that was taken.
static void nrn_fixed_step_group_thread(NrnThread* nth,
                                        int step_group_max,
                                        int step_group_begin,
                                        int& step_group_end) {
    nth->_stop_stepping = 0;
    int steps_done = step_group_begin;
    while (steps_done < step_group_max) {
        Instrumentor::phase p_timestep("timestep");
        nrn_fixed_step_thread(nth);
        ++steps_done;
        if (!nth->_stop_stepping) {
            continue;
        }
        // The group ends early: report progress and clear the flag.
        if (nth->id == 0) {
            step_group_end = steps_done;
        }
        nth->_stop_stepping = 0;
        return;
    }
    if (nth->id == 0) {
        step_group_end = step_group_max;
    }
}

// Add the solved right-hand side to the voltage of every node of the thread
// (twice the right-hand side when secondorder is set), then run the
// capacitance current and, if enabled, the fast_imem calculation.
void update(NrnThread* _nt) {
    // VEC_V / VEC_RHS are macros that rely on the parameter being named _nt.
    double* vec_v = &(VEC_V(0));
    double* vec_rhs = &(VEC_RHS(0));
    int i2 = _nt->end;

    /* do not need to worry about linmod or extracellular*/
    if (secondorder) {
        nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
                           async(_nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
        for (int i = 0; i < i2; ++i) {
            vec_v[i] += 2. * vec_rhs[i];
        }
    } else {
        nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
                           async(_nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
        for (int i = 0; i < i2; ++i) {
            vec_v[i] += vec_rhs[i];
        }
    }

    // The first mechanism in the thread's list is asserted to be capacitance.
    if (_nt->tml) {
        assert(_nt->tml->index == CAP);
        nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index);
    }
    if (nrn_use_fast_imem) {
        nrn_calc_fast_imem(_nt);
    }
}

// Run the state function of every mechanism of the thread that has one, each
// inside its own "state-<mechanism name>" profiling phase. When gap junctions
// are present the thread-level voltage transfer is done first.
void nonvint(NrnThread* _nt) {
    if (nrn_have_gaps) {
        Instrumentor::phase p("gap-v-transfer");
        nrnthread_v_transfer(_nt);
    }
    errno = 0;

    Instrumentor::phase_begin("state-update");
    for (auto tml = _nt->tml; tml; tml = tml->next) {
        const mod_f_t state_fn = corenrn.get_memb_func(tml->index).state;
        if (!state_fn) {
            continue;  // this mechanism has no state function
        }
        std::string phase_name("state-");
        phase_name += nrn_get_mechname(tml->index);
        {
            Instrumentor::phase p(phase_name.c_str());
            (*state_fn)(_nt, tml->ml, tml->index);
        }
#ifdef DEBUG
        if (errno) {
            hoc_warning("errno set during calculation of states", nullptr);
        }
#endif
    }
    Instrumentor::phase_end("state-update");
}

// Call every BEFORE/AFTER callback registered on the thread for slot `bat`
// (an index into nt->tbl), in list order.
void nrn_ba(NrnThread* nt, int bat) {
    auto entry = nt->tbl[bat];
    while (entry) {
        const mod_f_t callback = entry->bam->f;
        const int mech_type = entry->bam->type;
        (*callback)(nt, entry->ml, mech_type);
        entry = entry->next;
    }
}

// Prepare trajectory recording before a run: notify NEURON through the
// trajectory callback and reset the number of recorded samples (vsize) of
// every thread's trajectory requests.
void nrncore2nrn_send_init() {
    if (!nrn2core_trajectory_values_) {
        // standalone execution : no callbacks
        return;
    }
    // if per time step transfer, need to call nrn_record_init() in NEURON.
    // if storing full trajectories in CoreNEURON, need to initialize
    // vsize for all the trajectory requests.
    (*nrn2core_trajectory_values_)(-1, 0, nullptr, 0.0);
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        if (auto* requests = nrn_threads[tid].trajec_requests) {
            requests->vsize = 0;
        }
    }
}

// Record the current values of the thread's trajectory requests. Depending on
// how the request was set up the values are either appended to per-trajectory
// buffers (varrays) or copied to the scatter targets, followed by a call of
// the NEURON callback for this time step.
void nrncore2nrn_send_values(NrnThread* nth) {
    if (nrn2core_trajectory_values_ == nullptr) {
        // standalone execution : no callbacks
        return;
    }

    TrajectoryRequests* tr = nth->trajec_requests;
    if (tr) {
        if (tr->varrays) {  // full trajectories into Vector data
            // vs is the slot written in this call; vsize counts filled slots.
            int vs = tr->vsize++;
            // make sure we do not overflow the `varrays` buffers
            assert(vs < tr->bsize);

            nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
                               async(nth->stream_id))
            nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
            for (int i = 0; i < tr->n_trajec; ++i) {
                tr->varrays[i][vs] = *tr->gather[i];
            }
        } else if (tr->scatter) {  // scatter to NEURON and notify each step.
            nrn_assert(nrn2core_trajectory_values_);
            // Note that this is rather inefficient: we generate one `acc update
            // self` call for each `double` value (voltage, membrane current,
            // mechanism property, ...) that is being recorded, even though in most
            // cases these values will actually fall in a small number of contiguous
            // ranges in memory. A better solution, if the performance of this
            // branch becomes limiting, might be to offload this loop to the
            // device and populate some `scatter_values` array there and copy it
            // back with a single transfer. Note that the `async` clause here
            // should guarantee that correct values are reported even for
            // mechanism data that is updated in `nrn_state`. See also:
            // https://github.com/BlueBrain/CoreNeuron/issues/611
            for (int i = 0; i < tr->n_trajec; ++i) {
                double* gather_i = tr->gather[i];
                // Silences the unused-variable warning when the pragmas below
                // expand to nothing.
                static_cast<void>(gather_i);
                nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
                                   async(nth->stream_id))
                nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
            }
            // Wait for the asynchronous copies above before reading the values.
            nrn_pragma_acc(wait(nth->stream_id))
            for (int i = 0; i < tr->n_trajec; ++i) {
                *(tr->scatter[i]) = *(tr->gather[i]);
            }
            (*nrn2core_trajectory_values_)(nth->id, tr->n_pr, tr->vpr, nth->_t);
        }
    }
}

// First part of a fixed time step for one thread: deliver events, advance the
// thread time by half a step and, if the thread has cells, set up and solve
// the matrix and update the voltages. Without gap junctions the second part
// (nrn_fixed_step_lastpart) is run immediately; with gap junctions the caller
// runs it after the voltage transfer. Always returns nullptr.
static void* nrn_fixed_step_thread(NrnThread* nth) {
    /* check thresholds and deliver all (including binqueue)
       events up to t+dt/2 */
    {
        Instrumentor::phase p("deliver-events");
        deliver_net_events(nth);
    }

    nth->_t += .5 * nth->_dt;

    if (nth->ncell) {
        /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can
        launch kernel) */
        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
        nrn_pragma_acc(wait(nth->stream_id))
        nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
        fixed_play_continuous(nth);

        {
            Instrumentor::phase p("setup-tree-matrix");
            setup_tree_matrix_minimal(nth);
        }

        {
            Instrumentor::phase p("matrix-solver");
            nrn_solve_minimal(nth);
        }

        {
            Instrumentor::phase p("second-order-cur");
            second_order_cur(nth, secondorder);
        }

        {
            Instrumentor::phase p("update");
            update(nth);
        }
    }
    // With gap junctions the last part is scheduled separately by the caller.
    if (!nrn_have_gaps) {
        nrn_fixed_step_lastpart(nth);
    }
    return nullptr;
}

// Second part of a fixed time step for one thread: advance the thread time by
// the remaining half step, update mechanism states and run the AFTER_SOLVE and
// BEFORE_STEP callbacks (only when the thread has cells), send trajectory
// values and finally deliver events. Always returns nullptr.
void* nrn_fixed_step_lastpart(NrnThread* nth) {
    nth->_t += .5 * nth->_dt;

    if (nth->ncell) {
        /*@todo: do we need to update nth->_t on GPU */
        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
        nrn_pragma_acc(wait(nth->stream_id))
        nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
        fixed_play_continuous(nth);
        nonvint(nth);
        nrn_ba(nth, AFTER_SOLVE);
        nrn_ba(nth, BEFORE_STEP);
    }

    // Sent whether or not the thread has cells; when it does, this comes after
    // BEFORE_STEP, consistent with NEURON.
    nrncore2nrn_send_values(nth);

    {
        Instrumentor::phase p("deliver-events");
        nrn_deliver_events(nth); /* up to but not past texit */
    }

    return nullptr;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/fast_imem.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/utils/memory.h"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/utils/nrnoc_aux.hpp"

namespace coreneuron {

// Thread array and its length, defined in another translation unit.
extern int nrn_nthread;
extern NrnThread* nrn_threads;
// Definition of the flag declared in fast_imem.hpp; when true the fast_imem
// buffers are allocated by nrn_fast_imem_alloc().
bool nrn_use_fast_imem;

void fast_imem_free() {
    for (auto nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) {
        if (nt->nrn_fast_imem) {
            free_memory(nt->nrn_fast_imem->nrn_sav_rhs);
            free_memory(nt->nrn_fast_imem->nrn_sav_d);
            free_memory(nt->nrn_fast_imem);
            nt->nrn_fast_imem = nullptr;
        }
    }
}

void nrn_fast_imem_alloc() {
    if (nrn_use_fast_imem) {
        fast_imem_free();
        for (auto nt = nrn_threads; nt < nrn_threads + nrn_nthread; ++nt) {
            int n = nt->end;
            nt->nrn_fast_imem = (NrnFastImem*) ecalloc_align(1, sizeof(NrnFastImem));
            nt->nrn_fast_imem->nrn_sav_rhs = (double*) ecalloc_align(n, sizeof(double));
            nt->nrn_fast_imem->nrn_sav_d = (double*) ecalloc_align(n, sizeof(double));
        }
    }
}

// For every node of the thread, overwrite the saved rhs with
//   (sav_d * rhs + sav_rhs) * area * 0.01.
// Dereferences nt->nrn_fast_imem without a null check, so the buffers must
// have been allocated (see nrn_fast_imem_alloc) before this is called.
void nrn_calc_fast_imem(NrnThread* nt) {
    int i1 = 0;
    int i3 = nt->end;

    double* vec_rhs = nt->_actual_rhs;
    double* vec_area = nt->_actual_area;

    double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d;
    double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
    nrn_pragma_acc(
        parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
            async(nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    for (int i = i1; i < i3; ++i) {
        fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
    }
}

// Initialization variant of nrn_calc_fast_imem: for every node of the thread,
// overwrite the saved rhs with (rhs + sav_rhs) * area * 0.01 (the saved d term
// is not used here). Requires nt->nrn_fast_imem to be allocated.
void nrn_calc_fast_imem_init(NrnThread* nt) {
    // See the corresponding NEURON nrn_calc_fast_imem_fixedstep_init
    int i1 = 0;
    int i3 = nt->end;

    double* vec_rhs = nt->_actual_rhs;
    double* vec_area = nt->_actual_area;

    double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
    nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
                       async(nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    for (int i = i1; i < i3; ++i) {
        fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
    }
}

}  // namespace coreneuron


================================================
FILE: coreneuron/sim/fast_imem.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include "coreneuron/sim/multicore.hpp"

namespace coreneuron {

/* Global flag that enables the fast_imem (fast membrane current)
 * calculations.
 */
extern bool nrn_use_fast_imem;

/* Free the memory allocated for the fast membrane current calculation
 * on every thread.
 * Found in src/nrnoc/multicore.c in NEURON.
 */
void fast_imem_free();

/* Allocate the fast_imem buffers of every thread when nrn_use_fast_imem
 * is set (fast_imem_alloc() wrapper).
 * Found in src/nrnoc/multicore.c in NEURON.
 */
void nrn_fast_imem_alloc();

/* Calculate the new values of the saved rhs array at every timestep.
 * Found in src/nrnoc/fadvance.cpp in NEURON.
 */

void nrn_calc_fast_imem(NrnThread* _nt);
/* Initialization used only in offline (file) mode.
 * See NEURON nrn_calc_fast_imem_fixedstep_init in src/nrnoc/fadvance.cpp
 */
void nrn_calc_fast_imem_init(NrnThread* _nt);

}  // namespace coreneuron


================================================
FILE: coreneuron/sim/finitialize.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/nrnconf.h"
#include "coreneuron/network/netpar.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/sim/fast_imem.hpp"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/coreneuron.hpp"

namespace coreneuron {

// Set to true by allocate_data_in_mechanism_nrn_init() while it calls the
// mechanisms' initialize functions, and reset to false afterwards.
bool _nrn_skip_initmodel;

void allocate_data_in_mechanism_nrn_init() {
    // In case some nrn_init allocates data that we need. In this case
    // we want to call nrn_init but not execute initmodel i.e. INITIAL
    // block. For this, set _nrn_skip_initmodel to True temporarily
    // , execute nrn_init and return.
    _nrn_skip_initmodel = true;
    for (int i = 0; i < nrn_nthread; ++i) {  // could be parallel
        NrnThread& nt = nrn_threads[i];
        for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) {
            Memb_list* ml = tml->ml;
            mod_f_t s = corenrn.get_memb_func(tml->index).initialize;
            if (s) {
                (*s)(&nt, ml, tml->index);
            }
        }
    }
    _nrn_skip_initmodel = false;
}

// Initialize the simulation at t = 0.
//   setv : when non-zero, the voltage of every node is set to v
//   v    : initial voltage used when setv is non-zero
// The sequence below resets time and event queues, optionally sets the
// voltages, runs the BEFORE_INITIAL callbacks, the mechanisms' initialize
// functions and the AFTER_INITIAL callbacks, sets up the matrix, runs the
// BEFORE_STEP callbacks, sends the initial trajectory values and delivers the
// events pending at t = 0.
void nrn_finitialize(int setv, double v) {
    Instrumentor::phase_begin("finitialize");
    t = 0.;
    // -1 never equals a thread's dt, so this forces t/dt into every thread.
    dt2thread(-1.);
    nrn_thread_table_check();
    clear_event_queue();
    nrn_spike_exchange_init();
#if VECTORIZE
    nrn_play_init(); /* Vector.play */
                     /// Play events should be executed before initializing events
    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_deliver_events(nrn_threads + i); /* The play events at t=0 */
    }
    if (setv) {
        // The loop variable must be named _nt because VEC_V depends on it.
        for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) {
            double* vec_v = &(VEC_V(0));
            nrn_pragma_acc(
                parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu))
            nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
            for (int i = 0; i < _nt->end; ++i) {
                vec_v[i] = v;
            }
        }
    }

    if (nrn_have_gaps) {
        Instrumentor::phase p("gap-v-transfer");
        nrnmpi_v_transfer();
        for (int i = 0; i < nrn_nthread; ++i) {
            nrnthread_v_transfer(nrn_threads + i);
        }
    }

    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_ba(nrn_threads + i, BEFORE_INITIAL);
    }
    /* the INITIAL blocks are ordered so that mechanisms that write
       concentrations are after ions and before mechanisms that read
       concentrations.
    */
    /* the memblist list in NrnThread is already so ordered */
    for (int i = 0; i < nrn_nthread; ++i) {
        NrnThread* nt = nrn_threads + i;
        for (auto tml = nt->tml; tml; tml = tml->next) {
            mod_f_t s = corenrn.get_memb_func(tml->index).initialize;
            if (s) {
                (*s)(nt, tml->ml, tml->index);
            }
        }
    }
#endif

    init_net_events();
    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_ba(nrn_threads + i, AFTER_INITIAL);
    }
    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_deliver_events(nrn_threads + i); /* The INITIAL sent events at t=0 */
    }
    for (int i = 0; i < nrn_nthread; ++i) {
        setup_tree_matrix_minimal(nrn_threads + i);
        if (nrn_use_fast_imem) {
            nrn_calc_fast_imem_init(nrn_threads + i);
        }
    }
    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_ba(nrn_threads + i, BEFORE_STEP);
    }
    // Reset the trajectory bookkeeping, then record the values at t = 0.
    nrncore2nrn_send_init();
    for (int i = 0; i < nrn_nthread; ++i) {
        nrncore2nrn_send_values(nrn_threads + i);
    }
    // Consistent with NEURON. BEFORE_STEP and fixed_record_continuous before nrn_deliver_events.
    for (int i = 0; i < nrn_nthread; ++i) {
        nrn_deliver_events(nrn_threads + i); /* The record events at t=0 */
    }
#if NRNMPI
    nrn_spike_exchange(nrn_threads);
#endif
    Instrumentor::phase_end("finitialize");
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/multicore.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstdlib>
#include <vector>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/utils/memory.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

/*
Now that threads have taken over the actual_v, v_node, etc, it might
be a good time to regularize the method of freeing, allocating, and
updating those arrays. To recapitulate the history, Node used to
be the structure that held direct values for v, area, d, rhs, etc.
That continued to hold for the cray vectorization project which
introduced v_node, v_parent, memb_list. Cache efficiency introduced
actual_v, actual_area, actual_d, etc and the Node started pointing
into those arrays. Additional nodes after allocation required updating
pointers to v and area since those arrays were freed and reallocated.
Now, the threads hold all these arrays and we want to update them
properly under the circumstances of changing topology, changing
number of threads, and changing distribution of cells on threads.
Note there are no longer global versions of any of these arrays.
We do not want to update merely due to a change in area. Recently
we have dealt with diam, area, ri on a section basis. We generally
desire an update just before a simulation when the efficient
structures are necessary. This is reasonably well handled by the
v_structure_change flag which historically freed and reallocated
v_node and v_parent and, just before this comment,
ended up setting the NrnThread tml. This makes most of the old
memb_list vestigial and we now got rid of it except for
the artificial cells (and it is possibly not really necessary there).
Switching between sparse and tree matrix just causes freeing and
reallocation of actual_rhs.

If we can get the freeing, reallocation, and pointer update correct
for _actual_v, I am guessing everything else can be dragged along with
it. We have two major cases, call to pc.nthread and change in
model structure. We want to use Node* as much as possible and defer
the handling of v_structure_change as long as possible.
*/

namespace coreneuron {

// Global CoreNeuron instance; queried throughout for per-mechanism
// information (memb_funcs, data layouts, parameter sizes, ...).
CoreNeuron corenrn;

// Number of entries in nrn_threads and the array itself; both are managed by
// nrn_threads_create() and nrn_threads_free().
int nrn_nthread = 0;
NrnThread* nrn_threads = nullptr;
// NOTE(review): callback hook that is never assigned in this file; presumably
// installed elsewhere -- confirm.
void (*nrn_mk_transfer_thread_data_)();

/// --> CoreNeuron class
// Table built by nrn_mk_table_check(): consecutive (thread id,
// NrnThreadMembList*) pairs, one pair per mechanism type that has a
// thread_table_check_ callback. table_check_cnt_ is the number of ThreadDatum
// entries, i.e. twice the number of pairs.
static int table_check_cnt_;
static ThreadDatum* table_check_;


// Allocate and initialise the NrnThreadMembList entry (and its Memb_list) for
// the mech_id-th mechanism of a thread.
//   nt             : owning thread, handed to the mechanism's private constructor
//   mech_id        : position of the mechanism in mech_types / nodecounts
//   memb_func      : function table of the mechanism
//   shadow_rhs_cnt : in/out, raised to this mechanism's nodecount if that is
//                    larger and the mechanism is a non-artificial point process
//   mech_types     : mechanism type for every mech_id
//   nodecounts     : instance count for every mech_id
// Returns the new entry with next == nullptr. Reports an error through
// hoc_execerror when the mechanism has no alloc function and exits the
// program when it has no symbol.
NrnThreadMembList* create_tml(NrnThread& nt,
                              int mech_id,
                              Memb_func& memb_func,
                              int& shadow_rhs_cnt,
                              const std::vector<int>& mech_types,
                              const std::vector<int>& nodecounts) {
    auto* tml = (NrnThreadMembList*) emalloc_align(sizeof(NrnThreadMembList), 0);
    tml->next = nullptr;
    tml->index = mech_types[mech_id];
    const int type = tml->index;

    auto* ml = (Memb_list*) ecalloc_align(1, sizeof(Memb_list), 0);
    tml->ml = ml;
    ml->_net_receive_buffer = nullptr;
    ml->_net_send_buffer = nullptr;
    ml->_permute = nullptr;

    if (memb_func.alloc == nullptr) {
        hoc_execerror(memb_func.sym, "mechanism does not exist");
    }
    ml->nodecount = nodecounts[mech_id];
    if (!memb_func.sym) {
        printf("%s (type %d) is not available\n", nrn_get_mechname(type), type);
        exit(1);
    }
    ml->_nodecount_padded = nrn_soa_padded_size(ml->nodecount,
                                                corenrn.get_mech_data_layout()[type]);

    // Avoid race for multiple PointProcess instances in same compartment:
    // the shadow rhs must be able to hold the largest such mechanism.
    const bool real_point_process = memb_func.is_point &&
                                    corenrn.get_is_artificial()[type] == 0;
    if (real_point_process && ml->nodecount > shadow_rhs_cnt) {
        shadow_rhs_cnt = ml->nodecount;
    }

    if (auto* const priv_ctor = corenrn.get_memb_func(type).private_constructor) {
        priv_ctor(&nt, ml, type);
    }

    return tml;
}

// (Re)create the global nrn_threads array with n entries.
// Nothing happens when the current thread count already equals n. Otherwise
// any existing array is released, n default-constructed NrnThread objects are
// allocated (none when n <= 0), each gets its id and null BEFORE/AFTER lists,
// and the structure/diameter change flags are raised.
void nrn_threads_create(int n) {
    if (nrn_nthread != n) {
        /*printf("sizeof(NrnThread)=%d   sizeof(Memb_list)=%d\n", sizeof(NrnThread),
         * sizeof(Memb_list));*/

        // Release the previous array instead of just dropping the pointer,
        // which leaked it. delete[] on a null pointer is a no-op, so this is
        // safe on the first call and after nrn_threads_free().
        delete[] nrn_threads;
        nrn_threads = nullptr;
        nrn_nthread = n;
        if (n > 0) {
            nrn_threads = new NrnThread[n];
            for (int i = 0; i < nrn_nthread; ++i) {
                NrnThread& nt = nrn_threads[i];
                nt.id = i;
                for (int j = 0; j < BEFORE_AFTER_SIZE; ++j) {
                    nt.tbl[j] = nullptr;
                }
            }
        }
        v_structure_change = 1;
        diam_changed = 1;
    }
    /*printf("nrn_threads_create %d %d\n", nrn_nthread, nrn_thread_parallel_);*/
}

// Destroy the global thread array and reset the thread count to zero.
// Does nothing when the thread count is already zero.
void nrn_threads_free() {
    if (nrn_nthread == 0) {
        return;
    }
    delete[] nrn_threads;
    nrn_threads = nullptr;
    nrn_nthread = 0;
}

// Rebuild table_check_: for every mechanism type that has a
// thread_table_check_ callback, record the first thread that uses it together
// with that thread's NrnThreadMembList entry. The table is stored as
// consecutive (thread id, entry pointer) ThreadDatum pairs and consumed by
// nrn_thread_table_check().
void nrn_mk_table_check() {
    // Drop the table built by a previous call, if any.
    if (table_check_ != nullptr) {
        free((void*) table_check_);
        table_check_ = nullptr;
    }
    auto& memb_func = corenrn.get_memb_funcs();

    // first_thread[type] is the id of the first thread whose mechanism list
    // contains `type` (with a table-check callback), or -1 if there is none.
    std::vector<int> first_thread(memb_func.size(), -1);

    // Pass 1: pick the owning thread per type and count the table entries
    // (two ThreadDatum per mechanism type).
    table_check_cnt_ = 0;
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        for (auto tml = nrn_threads[tid].tml; tml; tml = tml->next) {
            const int type = tml->index;
            if (!memb_func[type].thread_table_check_ || first_thread[type] != -1) {
                continue;
            }
            first_thread[type] = tid;
            table_check_cnt_ += 2;
        }
    }

    if (table_check_cnt_) {
        table_check_ = (ThreadDatum*) emalloc(table_check_cnt_ * sizeof(ThreadDatum));
    }

    // Pass 2: fill the table with the entries selected in pass 1.
    int slot = 0;
    for (int tid = 0; tid < nrn_nthread; ++tid) {
        for (auto tml = nrn_threads[tid].tml; tml; tml = tml->next) {
            const int type = tml->index;
            if (memb_func[type].thread_table_check_ && first_thread[type] == tid) {
                table_check_[slot].i = tid;
                table_check_[slot + 1]._pvoid = (void*) tml;
                slot += 2;
            }
        }
    }
}

void nrn_thread_table_check() {
    for (int i = 0; i < table_check_cnt_; i += 2) {
        auto& nt = nrn_threads[table_check_[i].i];
        auto tml = static_cast<NrnThreadMembList*>(table_check_[i + 1]._pvoid);
        Memb_list* ml = tml->ml;
        (*corenrn.get_memb_func(tml->index).thread_table_check_)(
            0, ml->_nodecount_padded, ml->data, ml->pdata, ml->_thread, &nt, ml, tml->index);
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/multicore.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/nrnconf.h"
#include "coreneuron/mechanism/membfunc.hpp"
#include "coreneuron/utils/memory.h"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/io/reports/nrnreport.hpp"
#include <vector>
#include <memory>

namespace coreneuron {
class NetCon;
class PreSyn;

extern bool use_solve_interleave;

/*
   Point_process._presyn, used only if its NET_RECEIVE sends a net_event, is
   eliminated. Needed only by net_event function. Replaced by
   PreSyn* = nt->presyns + nt->pnt2presyn_ix[pnttype2presyn[pnt->_type]][pnt->_i_instance];
*/

/*
 * Node of the per-thread list of mechanisms; NrnThread::tml is the list head
 * and the list is walked through `next` until a null pointer is reached.
 *   next  - following node of the list
 *   ml    - Memb_list holding the data of this mechanism for the thread
 *   index - mechanism type; used to index the global Memb_func table
 */
struct NrnThreadMembList { /* patterned after CvMembList in cvodeobj.h */
    NrnThreadMembList* next;
    Memb_list* ml;
    int index;
    int* dependencies; /* list of mechanism types that this mechanism depends on*/
    int ndependencies; /* for scheduling we need to know the dependency count */
};
NrnThreadMembList* create_tml(NrnThread& nt,
                              int mech_id,
                              Memb_func& memb_func,
                              int& shadow_rhs_cnt,
                              const std::vector<int>& mech_types,
                              const std::vector<int>& nodecounts);

/*
 * Linked-list node stored in NrnThread::tbl, which has one list head per
 * BEFORE_AFTER_SIZE slot.
 * NOTE(review): `bam` presumably describes the BEFORE/AFTER block to run for
 * `ml` - confirm against the BAMech definition.
 */
struct NrnThreadBAList {
    Memb_list* ml; /* an item in the NrnThreadMembList */
    BAMech* bam;
    NrnThreadBAList* next;
};

/*
 * Storage referenced by NrnThread::nrn_fast_imem for the fast membrane
 * current calculation.
 * NOTE(review): the names suggest saved copies of the thread's rhs and d
 * vectors - confirm against the code that fills them.
 */
struct NrnFastImem {
    double* nrn_sav_rhs;
    double* nrn_sav_d;
};

/*
 * Bookkeeping for the per time step values that are returned to NEURON
 * (referenced from NrnThread::trajec_requests). Which of `scatter` and
 * `varrays` is used depends on `bsize`, see the member comments.
 */
struct TrajectoryRequests {
    void** vpr;       /* PlayRecord Objects known by NEURON */
    double** scatter; /* if bsize == 0, each time step */
    double** varrays; /* if bsize > 0, the Vector data pointers. */
    double** gather;  /* pointers to values that get scattered to NEURON */
    int n_pr;         /* number of PlayRecord instances */
    int n_trajec;     /* number of trajectories requested */
    int bsize;        /* buffer size of the Vector data */
    int vsize;        /* number of elements in varrays so far */
};

/* For OpenACC: updating a PreSyn, which has a virtual base class, on the
 * device causes an error. To avoid that, this helper struct carries a flag
 * variable that can be updated on the GPU instead.
 */
struct PreSynHelper {
    int flag_;
};

/**
 * Per-thread simulation state: time/step values, the mechanism list, network
 * connectivity arrays, the tree-matrix vectors and assorted buffers.
 *
 * Every member carries a default initializer so that a default-constructed
 * NrnThread is in a well-defined, empty state.
 */
struct NrnThread: public MemoryManaged {
    double _t = 0;
    double _dt = -1e9;
    double cj = 0.0;

    NrnThreadMembList* tml = nullptr;
    Memb_list** _ml_list = nullptr;
    Point_process* pntprocs = nullptr;  // synapses and artificial cells with and without gid
    PreSyn* presyns = nullptr;          // all the output PreSyn with and without gid
    PreSynHelper* presyns_helper = nullptr;
    int** pnt2presyn_ix = nullptr;  // eliminates Point_process._presyn used only by net_event
                                    // sender.
    NetCon* netcons = nullptr;
    double* weights = nullptr;  // size n_weight. NetCon.weight_ points into this array.

    int n_pntproc = 0;
    int n_weight = 0;
    int n_netcon = 0;
    int n_input_presyn = 0;
    int n_presyn = 0;       // only for model_size
    int n_real_output = 0;  // for checking their thresholds.

    int ncell = 0; /* analogous to old rootnodecount */
    int end = 0;   /* 1 + position of last in v_node array. Now v_node_count. */
    int id = 0;    /* this is nrn_threads[id] */
    int _stop_stepping = 0;
    int n_vecplay = 0; /* number of instances of VecPlayContinuous */

    size_t _ndata = 0;
    size_t _nvdata = 0;
    size_t _nidata = 0;        /* sizes */
    double* _data = nullptr;   /* all the other double* and Datum to doubles point into here*/
    int* _idata = nullptr;     /* all the Datum to ints index into here */
    void** _vdata = nullptr;   /* all the Datum to pointers index into here */
    void** _vecplay = nullptr; /* array of instances of VecPlayContinuous */

    double* _actual_rhs = nullptr;
    double* _actual_d = nullptr;
    double* _actual_a = nullptr;
    double* _actual_b = nullptr;
    double* _actual_v = nullptr;
    double* _actual_area = nullptr;
    double* _actual_diam = nullptr; /* nullptr if no mechanism has dparam with diam semantics */
    double* _shadow_rhs = nullptr;  /* Not pointer into _data. Avoid race for multiple POINT_PROCESS
                             in same  compartment */
    double* _shadow_d = nullptr; /* Not pointer into _data. Avoid race for multiple POINT_PROCESS in
                          same compartment */

    /* Fast membrane current calculation struct */
    NrnFastImem* nrn_fast_imem = nullptr;

    int* _v_parent_index = nullptr;
    int* _permute = nullptr;
    char* _sp13mat = nullptr;              /* handle to general sparse matrix */
    Memb_list* _ecell_memb_list = nullptr; /* normally nullptr */

    double _ctime = 0.0; /* computation time in seconds (using nrnmpi_wtime) */

    /* One list head per BEFORE/AFTER slot; wasteful since almost all empty.
     * Value-initialized so that every head starts out as nullptr, like all
     * the other pointer members of this struct. */
    NrnThreadBAList* tbl[BEFORE_AFTER_SIZE] = {};

    int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */
    int compute_gpu = 0;    /* define whether to compute with gpus */
    int stream_id = 0;      /* define where the kernel will be launched on GPU stream */
    int _net_send_buffer_size = 0;
    int _net_send_buffer_cnt = 0;
    int* _net_send_buffer = nullptr;

    int* _watch_types = nullptr; /* nullptr or 0 terminated array of integers */
    void* mapping = nullptr;     /* section to segment mapping information */
    std::unique_ptr<SummationReportMapping> summation_report_handler_; /* report to ALU (values of
                                                                          the current summation) */
    TrajectoryRequests* trajec_requests = nullptr; /* per time step values returned to NEURON */

    /* Needed in case there are FOR_NETCON statements in use. */
    std::size_t _fornetcon_perm_indices_size{}; /* length of _fornetcon_perm_indices */
    size_t* _fornetcon_perm_indices{};          /* displacement like list of indices */
    std::size_t _fornetcon_weight_perm_size{};  /* length of _fornetcon_weight_perm */
    size_t* _fornetcon_weight_perm{};           /* permutation indices into weight */

    std::vector<int> _pnt_offset; /* for SelfEvent queue transfer */
};

extern void nrn_threads_create(int n);
extern int nrn_nthread;
extern NrnThread* nrn_threads;
/**
 * Run `job(&nrn_threads[i], args...)` once for every NrnThread, distributing
 * the iterations over OpenMP threads when OpenMP is enabled.
 *
 * @param job  callable invoked with a NrnThread* followed by `args`
 * @param args extra arguments handed to every invocation of `job`
 *
 * The extra arguments are deliberately passed on as lvalues and not through
 * std::forward: the same pack is consumed once per thread, so forwarding an
 * rvalue would let the first invocation move from it and leave the remaining
 * (possibly concurrent) invocations with a moved-from object.
 */
template <typename F, typename... Args>
void nrn_multithread_job(F&& job, Args&&... args) {
    int i;
    // clang-format off

    #pragma omp parallel for private(i) shared(nrn_threads, job, nrn_nthread, \
                                           nrnmpi_myid) schedule(static, 1)
    for (i = 0; i < nrn_nthread; ++i) {
        job(nrn_threads + i, args...);
    }
    // clang-format on
}

extern void nrn_thread_table_check(void);

extern void nrn_threads_free(void);

extern bool _nrn_skip_initmodel;


extern void dt2thread(double);
extern void clear_event_queue(void);
extern void nrn_ba(NrnThread*, int);
extern void* nrn_fixed_step_lastpart(NrnThread*);
extern void nrn_solve_minimal(NrnThread*);
extern void nrncore2nrn_send_init();
extern void* setup_tree_matrix_minimal(NrnThread*);
extern void nrncore2nrn_send_values(NrnThread*);
extern void nrn_fixed_step_group_minimal(int total_sim_steps);
extern void nrn_fixed_single_steps_minimal(int total_sim_steps, double tstop);
extern void nrn_fixed_step_minimal(void);
extern void nrn_finitialize(int setv, double v);
extern void direct_mode_initialize();
extern void nrn_mk_table_check(void);
extern void nonvint(NrnThread* _nt);
extern void update(NrnThread*);

/**
 * Report whether the event time `te`, shifted back by 1e-11, falls into the
 * half-open interval (t - dt, t] of the thread's current step.
 *
 * @return 1 if it does, 0 otherwise
 */
constexpr int at_time(NrnThread* nt, double te) {
    const double shifted = te - 1e-11;
    const bool reached = shifted <= nt->_t;
    const bool within_step = shifted > (nt->_t - nt->_dt);
    return (reached && within_step) ? 1 : 0;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/abort.cpp
================================================
/******************************************************************************
 *
 * File: abort.c
 *
 * Copyright (c) 1984, 1985, 1986, 1987, 1988, 1989, 1990
 *   Duke University
 *
 ******************************************************************************/
#include "coreneuron/utils/nrnoc_aux.hpp"

/*-----------------------------------------------------------------------------
 *
 * ABORT_RUN()
 *
 *    Prints out an error message and returns to the main menu if a solver
 *    routine returns a nonzero error code.
 *
 * Calling sequence: abort_run(code)
 *
 * Argument:	code	int	flag for error
 *
 * Returns:
 *
 * Functions called: abs(), cls(), cursrpos(), puts(), gets()
 *
 * Files accessed:
 *---------------------------------------------------------------------------*/

#include <setjmp.h>
#include <stdio.h>
#include "errcodes.h"
namespace coreneuron {
int abort_run(int code) {
    switch ((code >= 0) ? code : -code) {
        case EXCEED_ITERS:
            puts("Convergence not achieved in maximum number of iterations");
            break;
        case SINGULAR:
            puts("The matrix in the solution method is singular or ill-conditioned");
            break;
        case PRECISION:
            puts(
                "The increment in the independent variable is less than machine "
                "roundoff error");
            break;
        case CORR_FAIL:
            puts("The corrector failed to satisfy the error check");
            break;
        case DIVERGED:
            puts("The corrector iteration diverged");
            break;
        case INCONSISTENT:
            puts("Inconsistent boundary conditions");
            puts("Convergence not acheived in maximum number of iterations");
            break;
        case BAD_START:
            puts("Poor starting estimate for initial conditions");
            puts("The matrix in the solution method is singular or ill-conditioned");
            break;
        case NODATA:
            puts("No data found in data file");
            break;
        case NO_SOLN:
            puts("No solution was obtained for the coefficients");
            break;
        case LOWMEM:
            puts("Insufficient memory to run the model");
            break;
        case DIVCHECK:
            puts("Attempt to divide by zero");
            break;
        case NOFORCE:
            puts(
                "Could not open forcing function file\nThe model cannot be run "
                "without the forcing function");
            break;
        case NEG_ARG:
            puts("Cannot compute factorial of negative argument");
            break;
        case RANGE:
            puts(
                "Value of variable is outside the range of the forcing function data "
                "table");
            break;
        default:
            puts("Origin of error is unknown");
    }
    hoc_execerror("scopmath library error", (char*) 0);
    return 0;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/crout_thread.hpp
================================================
/*
# =============================================================================
# Originally crout.c from SCoP library, Copyright (c) 1987-90 Duke University
# =============================================================================
# Subsequent extensive prototype and memory layout changes for CoreNEURON
#
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once
#include "coreneuron/sim/scopmath/errcodes.h"
#include "coreneuron/sim/scopmath/newton_struct.h"

namespace coreneuron {
#if defined(scopmath_crout_ix) || defined(scopmath_crout_y) || defined(scopmath_crout_b)
#error "naming clash on crout_thread.hpp-internal macros"
#endif
#define scopmath_crout_b(arg)  b[scopmath_crout_ix(arg)]
#define scopmath_crout_ix(arg) ((arg) *_STRIDE)
#define scopmath_crout_y(arg)  _p[y[arg] * _STRIDE]

/**
 * Performs an LU triangular factorization of a real matrix by the Crout
 * algorithm using partial pivoting. Rows are not normalized; implicit
 * equilibration is used. ROUNDOFF is the minimal value for a pivot element
 * without its being considered too close to zero (currently set to 1.0E-20).
 *
 * All element accesses go through scopmath_crout_ix(), i.e. logical index i
 * lives at offset i * _STRIDE of the respective array.
 *
 * @return SUCCESS (0) if no error; SINGULAR (2) if matrix is singular or
 *         ill-conditioned
 * @param ns workspace; only its rowmax vector is used here
 * @param n number of rows of the matrix
 * @param[in,out] a double precision matrix to be factored. On return the
 *               factors required to transform the constant vector in the set of
 *               simultaneous equations are stored in the lower triangle;
 *               factors for back substitution are stored in the upper triangle.
 * @param[out] perm permutation vector to store row interchanges
 *
 * @note Having a different permutation per instance may not be a good idea.
 */
inline int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) {
    // Position (within perm) of the row most recently chosen as pivot candidate.
    int save_i = 0;

    /* Initialize permutation and rowmax vectors */
    double* rowmax = ns->rowmax;
    for (int i = 0; i < n; i++) {
        perm[scopmath_crout_ix(i)] = i;
        // k tracks the column holding the entry of largest magnitude in row i.
        int k = 0;
        for (int j = 1; j < n; j++)
            if (fabs(a[i][scopmath_crout_ix(j)]) > fabs(a[i][scopmath_crout_ix(k)]))
                k = j;
        // The signed value is stored; only its magnitude matters for the pivot search below.
        rowmax[scopmath_crout_ix(i)] = a[i][scopmath_crout_ix(k)];
    }

    /* Loop over rows and columns r */
    for (int r = 0; r < n; r++) {
        /*
         * Operate on rth column.  This produces the lower triangular matrix
         * of terms needed to transform the constant vector.
         */

        for (int i = r; i < n; i++) {
            double sum = 0.0;
            int irow = perm[scopmath_crout_ix(i)];
            for (int k = 0; k < r; k++) {
                int krow = perm[scopmath_crout_ix(k)];
                sum += a[irow][scopmath_crout_ix(k)] * a[krow][scopmath_crout_ix(r)];
            }
            a[irow][scopmath_crout_ix(r)] -= sum;
        }

        /* Find row containing the pivot in the rth column */
        int pivot = perm[scopmath_crout_ix(r)];
        // Candidates are compared after scaling by their row maximum (implicit equilibration).
        double equil_1 = fabs(a[pivot][scopmath_crout_ix(r)] / rowmax[scopmath_crout_ix(pivot)]);
        for (int i = r + 1; i < n; i++) {
            int irow = perm[scopmath_crout_ix(i)];
            double equil_2 = fabs(a[irow][scopmath_crout_ix(r)] / rowmax[scopmath_crout_ix(irow)]);
            if (equil_2 > equil_1) {
                /* make irow the new pivot row */

                pivot = irow;
                save_i = i;
                equil_1 = equil_2;
            }
        }

        /* Interchange entries in permutation vector if necessary */
        if (pivot != perm[scopmath_crout_ix(r)]) {
            perm[scopmath_crout_ix(save_i)] = perm[scopmath_crout_ix(r)];
            perm[scopmath_crout_ix(r)] = pivot;
        }

        /* Check that pivot element is not too small */
        if (fabs(a[pivot][scopmath_crout_ix(r)]) < ROUNDOFF)
            return SINGULAR;

        /*
         * Operate on row in rth position.  This produces the upper
         * triangular matrix whose diagonal elements are assumed to be unity.
         * This matrix is used in the back substitution algorithm.
         */
        for (int j = r + 1; j < n; j++) {
            double sum = 0.0;
            for (int k = 0; k < r; k++) {
                int krow = perm[scopmath_crout_ix(k)];
                sum += a[pivot][scopmath_crout_ix(k)] * a[krow][scopmath_crout_ix(j)];
            }
            a[pivot][scopmath_crout_ix(j)] = (a[pivot][scopmath_crout_ix(j)] - sum) /
                                             a[pivot][scopmath_crout_ix(r)];
        }
    }
    return SUCCESS;
}

/**
 * Performs forward substitution algorithm to transform the constant vector in
 * the linear simultaneous equations to be consistent with the factored matrix.
 * Then performs back substitution to find the solution to the simultaneous
 * linear equations.
 *
 * Logical index i of a, b, perm and p is stored at offset i * _STRIDE.
 *
 * @param n number of rows of the matrix
 * @param a double precision matrix containing the factored matrix of
 *          coefficients of the linear equations
 * @param b vector of function values
 * @param perm permutation vector holding the row interchanges
 * @param[out] p receives the solution vector (p[i * _STRIDE])
 * @param y index vector; currently unused because the branch that would write
 *          the solution to _p[y[i] * _STRIDE] is disabled with `if (0)`
 */
inline void nrn_scopmath_solve_thread(int n,
                                      double** a,
                                      double* b,
                                      int* perm,
                                      double* p,
                                      int* y,
                                      _threadargsproto_) {
    /* Perform forward substitution with pivoting */
    // The y-indexed variant below is compiled but never executed: testing `y`
    // at runtime was replaced by a constant because of the compiler bug noted here.
    // if (y) { // pgacc bug. nullptr on cpu but not on GPU
    if (0) {
        for (int i = 0; i < n; i++) {
            int pivot = perm[scopmath_crout_ix(i)];
            double sum = 0.0;
            for (int j = 0; j < i; j++)
                sum += a[pivot][scopmath_crout_ix(j)] * (scopmath_crout_y(j));
            scopmath_crout_y(i) = (scopmath_crout_b(pivot) - sum) / a[pivot][scopmath_crout_ix(i)];
        }

        /*
         * Note that the y vector is already in the correct order for back
         * substitution.  Perform back substitution, pivoting the matrix but not
         * the y vector.  There is no need to divide by the diagonal element as
         * this is assumed to be unity.
         */

        for (int i = n - 1; i >= 0; i--) {
            int pivot = perm[scopmath_crout_ix(i)];
            double sum = 0.0;
            for (int j = i + 1; j < n; j++)
                sum += a[pivot][scopmath_crout_ix(j)] * (scopmath_crout_y(j));
            scopmath_crout_y(i) -= sum;
        }
    } else {
        // Active path: the solution is accumulated directly in p.
        for (int i = 0; i < n; i++) {
            int pivot = perm[scopmath_crout_ix(i)];
            double sum = 0.0;
            if (i > 0) {  // pgacc bug. with i==0 the following loop executes once
                for (int j = 0; j < i; j++) {
                    sum += a[pivot][scopmath_crout_ix(j)] * (p[scopmath_crout_ix(j)]);
                }
            }
            p[scopmath_crout_ix(i)] = (scopmath_crout_b(pivot) - sum) /
                                      a[pivot][scopmath_crout_ix(i)];
        }

        /*
         * Note that the solution vector is already in the correct order for
         * back substitution.  Perform back substitution, pivoting the matrix
         * but not the solution vector.  There is no need to divide by the
         * diagonal element as this is assumed to be unity.
         */
        for (int i = n - 1; i >= 0; i--) {
            int pivot = perm[scopmath_crout_ix(i)];
            double sum = 0.0;
            for (int j = i + 1; j < n; j++)
                sum += a[pivot][scopmath_crout_ix(j)] * (p[scopmath_crout_ix(j)]);
            p[scopmath_crout_ix(i)] -= sum;
        }
    }
}
#undef scopmath_crout_b
#undef scopmath_crout_ix
#undef scopmath_crout_y
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/errcodes.h
================================================
/*
# =============================================================================
# Originally errcodes.h from SCoP library, Copyright (c) 1984-90 Duke University
# =============================================================================
# Subsequent extensive prototype and memory layout changes for CoreNEURON
#
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once
namespace coreneuron {
extern int abort_run(int);
namespace scopmath {
/** @brief Flag to disable some code sections at compile time.
 *
 *  Some methods, such as coreneuron::scopmath::sparse::getelm(...), decide at
 *  runtime whether they are simply accessors, or if they dynamically modify the
 *  matrix in question, possibly allocating new memory. Typically the second
 *  mode will be used during model initialisation, while the first will be used
 *  during computation/simulation. Compiling the more complicated code for the
 *  second mode can be problematic for targets such as GPU, where dynamic
 *  allocation and global state are complex. This enum is intended to be used as
 *  a template parameter to flag (at compile time) when this code can be
 *  omitted.
 *
 *  NOTE(review): presumably `all` keeps both modes compiled in and
 *  `compute_only` omits the initialisation-time code - confirm against the
 *  templates that take this enum.
 */
enum struct enabled_code { all, compute_only };
}  // namespace scopmath
}  // namespace coreneuron
/*
 * Numerical tolerances and iteration limits of the scopmath solvers.
 * Among others:
 *   ROUNDOFF  - smallest pivot magnitude accepted by the Crout factorization
 *   ZERO      - magnitude below which a value is treated as zero in the
 *               Newton convergence test
 *   STEP      - minimum increment used for the finite-difference Jacobian
 *   CONVERGE  - largest relative change accepted as converged by Newton
 *   MAXCHANGE - relative change above which Newton rebuilds the Jacobian
 *   MAXITERS  - maximum number of Newton iterations
 */
#define ROUNDOFF       1.e-20
#define ZERO           1.e-8
#define STEP           1.e-6
#define CONVERGE       1.e-6
#define MAXCHANGE      0.05
#define INITSIMPLEX    0.25
#define MAXITERS       50
#define MAXSMPLXITERS  100
#define MAXSTEPS       20
#define MAXHALVE       15
#define MAXORDER       6
#define MAXTERMS       3
#define MAXFAIL        10
#define MAX_JAC_ITERS  20
#define MAX_GOOD_ORDER 2
#define MAX_GOOD_STEPS 3

/*
 * Error codes returned by the solver routines; abort_run() translates them
 * into messages. SUCCESS means no error.
 */
#define SUCCESS      0
#define EXCEED_ITERS 1
#define SINGULAR     2
#define PRECISION    3
#define CORR_FAIL    4
#define INCONSISTENT 5
#define BAD_START    6
#define NODATA       7
#define NO_SOLN      8
#define LOWMEM       9
#define DIVCHECK     10
#define NOFORCE      11
#define DIVERGED     12
#define NEG_ARG      13
#define RANGE        14


================================================
FILE: coreneuron/sim/scopmath/newton_struct.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once
#include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"

namespace coreneuron {

/* avoid incessant alloc/free memory */
/*
 * Scratch space reused by the Newton solver. nrn_cons_newtonspace(n, n_instance)
 * allocates it and nrn_destroy_newtonspace() releases it. The vectors are
 * sized for n * n_instance values.
 */
struct NewtonSpace {
    int n;             /* number of equations, as passed to nrn_cons_newtonspace */
    int n_instance;    /* number of instances, as passed to nrn_cons_newtonspace */
    double* delta_x;   /* solution increment computed in each Newton iteration */
    double** jacobian; /* Jacobian matrix, created with makematrix(n, n * n_instance) */
    int* perm;         /* row permutation produced by the Crout factorization */
    double* high_value; /* function values at x + increment (finite differences) */
    double* low_value;  /* function values at x - increment (finite differences) */
    double* rowmax;     /* per-row reference entry used for pivot scaling in Crout */
};

void nrn_newtonspace_copyto_device(NewtonSpace* ns);
void nrn_newtonspace_delete_from_device(NewtonSpace* ns);

}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/newton_thread.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#include <math.h>
#include <stdlib.h>

#include "coreneuron/sim/scopmath/newton_thread.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

namespace coreneuron {
/**
 * Allocate a NewtonSpace for `n` equations and `n_instance` instances and
 * copy it to the device.
 *
 * @param n          number of equations
 * @param n_instance number of instances sharing the workspace
 * @return newly allocated workspace; release with nrn_destroy_newtonspace()
 */
NewtonSpace* nrn_cons_newtonspace(int n, int n_instance) {
    // Do the size arithmetic in size_t: the product of two ints can overflow
    // and a cast to unsigned would truncate large byte counts.
    const size_t n_values = static_cast<size_t>(n) * static_cast<size_t>(n_instance);

    NewtonSpace* ns = (NewtonSpace*) emalloc(sizeof(NewtonSpace));
    ns->n = n;
    ns->n_instance = n_instance;
    // NOTE(review): makevector is given a byte count, exactly as before -
    // confirm against its declaration.
    ns->delta_x = makevector(n_values * sizeof(double));
    ns->jacobian = makematrix(n, n_values);
    ns->perm = (int*) emalloc(n_values * sizeof(int));
    ns->high_value = makevector(n_values * sizeof(double));
    ns->low_value = makevector(n_values * sizeof(double));
    ns->rowmax = makevector(n_values * sizeof(double));
    nrn_newtonspace_copyto_device(ns);
    return ns;
}

/**
 * Remove the workspace from the device, then free every buffer it owns and
 * finally the NewtonSpace object itself.
 */
void nrn_destroy_newtonspace(NewtonSpace* ns) {
    nrn_newtonspace_delete_from_device(ns);

    // Buffers obtained with emalloc.
    free(ns->perm);

    // Buffers obtained with makevector / makematrix.
    freevector(ns->delta_x);
    freematrix(ns->jacobian);
    freevector(ns->high_value);
    freevector(ns->low_value);
    freevector(ns->rowmax);

    free(ns);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/newton_thread.hpp
================================================
/*
# =============================================================================
# Originally newton.c from SCoP library, Copyright (c) 1987-90 Duke University
# =============================================================================
# Subsequent extensive prototype and memory layout changes for CoreNEURON
#
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once
#include "coreneuron/sim/scopmath/errcodes.h"
#include "coreneuron/sim/scopmath/newton_struct.h"
#include "coreneuron/sim/scopmath/crout_thread.hpp"

#include <algorithm>
#include <cmath>

namespace coreneuron {
#if defined(scopmath_newton_ix) || defined(scopmath_newton_s) || defined(scopmath_newton_x)
#error "naming clash on newton_thread.hpp-internal macros"
#endif
#define scopmath_newton_ix(arg) ((arg) *_STRIDE)
#define scopmath_newton_s(arg)  _p[s[arg] * _STRIDE]
#define scopmath_newton_x(arg)  _p[(arg) *_STRIDE]
namespace detail {
/**
 * @brief Calculate the Jacobian matrix using finite central differences.
 *
 * Creates the Jacobian matrix by computing partial derivatives by finite
 * central differences. If the column variable is nonzero, an increment of 2% of
 * the variable is used. STEP is the minimum increment allowed; it is currently
 * set to 1.0E-6.
 *
 * The j-th variable is _p[index[j] * _STRIDE]; logical index i of value,
 * high_value, low_value and of a Jacobian row is stored at offset i * _STRIDE.
 *
 * @param ns workspace providing the high_value and low_value scratch vectors
 * @param n number of variables
 * @param index positions (before applying _STRIDE) of the variables within _p
 * @param func callable that computes the deviation from zero of each equation
 *             in the model; it is expected to refresh `value`
 * @param value array of function values written by `func`
 * @param[out] jacobian computed jacobian matrix
 */
template <typename F>
void nrn_buildjacobian_thread(NewtonSpace* ns,
                              int n,
                              int* index,
                              F const& func,
                              double* value,
                              double** jacobian,
                              _threadargsproto_) {
    double* high_value = ns->high_value;
    double* low_value = ns->low_value;

    /* Compute partial derivatives by central finite differences */

    for (int j = 0; j < n; j++) {
        double increment = std::max(std::fabs(0.02 * (scopmath_newton_x(index[j]))), STEP);
        // Evaluate at x_j + increment ...
        scopmath_newton_x(index[j]) += increment;
        func(_threadargs_);  // std::invoke in C++17
        for (int i = 0; i < n; i++)
            high_value[scopmath_newton_ix(i)] = value[scopmath_newton_ix(i)];
        // ... and at x_j - increment.
        scopmath_newton_x(index[j]) -= 2.0 * increment;
        func(_threadargs_);  // std::invoke in C++17
        for (int i = 0; i < n; i++) {
            low_value[scopmath_newton_ix(i)] = value[scopmath_newton_ix(i)];

            /* Insert partials into jth column of Jacobian matrix */

            jacobian[i][scopmath_newton_ix(j)] = (high_value[scopmath_newton_ix(i)] -
                                                  low_value[scopmath_newton_ix(i)]) /
                                                 (2.0 * increment);
        }

        /* Restore original variable and function values. */

        scopmath_newton_x(index[j]) += increment;
        func(_threadargs_);  // std::invoke in C++17
    }
}
#undef scopmath_newton_x
}  // namespace detail

/**
 * Iteratively solves simultaneous nonlinear equations by Newton's method, using
 * a Jacobian matrix computed by finite differences.
 *
 * The i-th unknown is _p[s[i] * _STRIDE]; logical index i of value and of the
 * workspace vectors is stored at offset i * _STRIDE.
 *
 * @return 0 if no error; 2 if matrix is singular or ill-conditioned; 1 if
 *         maximum iterations exceeded.
 * @param ns workspace providing delta_x, the Jacobian and the permutation
 * @param n number of variables to solve for
 * @param s positions (before applying _STRIDE) of the unknowns within _p. Must
 *          not be null: it is dereferenced when the Jacobian is built and when
 *          the solution is updated.
 * @param func callable that computes the deviation from zero of each equation
 *             in the model
 * @param value array of the function values written by `func`
 *
 * On return the unknowns hold the solution, or the most recent iteration's
 * result in the event of an error.
 */
template <typename F>
inline int nrn_newton_thread(NewtonSpace* ns,
                             int n,
                             int* s,
                             F func,
                             double* value,
                             _threadargsproto_) {
    int count = 0, error = 0;
    double change = 1.0, max_dev, temp;
    // 0: keep iterating, 1: converged, 2: stopped because of an error.
    int done = 0;
    /*
     * Fetch the preallocated arrays for the Jacobian, the variable increments
     * and the permutation vector
     */
    double* delta_x = ns->delta_x;
    double** jacobian = ns->jacobian;
    int* perm = ns->perm;
    /* Iteration loop */
    while (!done) {
        if (count++ >= MAXITERS) {
            error = EXCEED_ITERS;
            done = 2;
        }
        if (!done && change > MAXCHANGE) {
            /*
             * Recalculate Jacobian matrix if solution has changed by more
             * than MAXCHANGE
             */
            detail::nrn_buildjacobian_thread(ns, n, s, func, value, jacobian, _threadargs_);
            /* Required correction to function values */
            for (int i = 0; i < n; i++)
                value[scopmath_newton_ix(i)] = -value[scopmath_newton_ix(i)];
            error = nrn_crout_thread(ns, n, jacobian, perm, _threadargs_);
            if (error != SUCCESS) {
                done = 2;
            }
        }

        if (!done) {
            nrn_scopmath_solve_thread(n, jacobian, value, perm, delta_x, (int*) 0, _threadargs_);

            /* Update solution vector and compute norms of delta_x and value */

            // The original code tested `s` here, but both branches were the
            // same statements and both indexed through `s`, so a single loop
            // is equivalent.
            change = 0.0;
            for (int i = 0; i < n; i++) {
                if (std::fabs(scopmath_newton_s(i)) > ZERO &&
                    (temp = std::fabs(delta_x[scopmath_newton_ix(i)] / (scopmath_newton_s(i)))) >
                        change)
                    change = temp;
                scopmath_newton_s(i) += delta_x[scopmath_newton_ix(i)];
            }
            // Evaluate function values with new solution.
            func(_threadargs_);  // std::invoke in C++17
            max_dev = 0.0;
            for (int i = 0; i < n; i++) {
                /* Required correction to function values */
                value[scopmath_newton_ix(i)] = -value[scopmath_newton_ix(i)];
                if ((temp = std::fabs(value[scopmath_newton_ix(i)])) > max_dev)
                    max_dev = temp;
            }

            /* Check for convergence or maximum iterations */

            if (change <= CONVERGE && max_dev <= ZERO) {
                // break;
                done = 1;
            }
        }
    } /* end of while loop */

    return (error);
}
#undef scopmath_newton_ix
#undef scopmath_newton_s

NewtonSpace* nrn_cons_newtonspace(int n, int n_instance);
void nrn_destroy_newtonspace(NewtonSpace* ns);
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/sparse_thread.hpp
================================================
/*
# =============================================================================
# Originally sparse.c from SCoP library, Copyright (c) 1989-90 Duke University
# =============================================================================
# Subsequent extensive prototype and memory layout changes for CoreNEURON
#
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once
#include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
#include "coreneuron/sim/scopmath/errcodes.h"

namespace coreneuron {
namespace scopmath {
namespace sparse {
// Methods that may be called from offloaded regions are declared inline.
/** Unlink item from the doubly linked list it is in and clear its own links. */
inline void delete_item(Item* item) {
    Item* const successor = item->next;
    Item* const predecessor = item->prev;
    successor->prev = predecessor;
    predecessor->next = successor;
    item->next = nullptr;
    item->prev = nullptr;
}

/* Splice ii into the list so that it sits immediately ahead of item. */
inline void linkitem(Item* item, Item* ii) {
    Item* const before = item->prev;
    ii->next = item;
    ii->prev = before;
    item->prev = ii;
    // Go through ii->prev (rather than `before`) exactly as the list now records it.
    ii->prev->next = ii;
}

/**
 * Link item into so->orderlist ahead of the first entry whose norder is not
 * smaller than item's; if there is no such entry, item is linked ahead of the
 * list head, i.e. it becomes the last entry.
 */
inline void insert(SparseObj* so, Item* item) {
    Item* const head = so->orderlist;
    Item* cursor = head->next;
    while (cursor != head && cursor->norder < item->norder) {
        cursor = cursor->next;
    }
    linkitem(cursor, item);
}

/* note: solution order refers to the following
        diag[varord[row]]->row = row = diag[varord[row]]->col
        rowst[varord[row]]->row = row
        varord[el->row] < varord[el->c_right->row]
        varord[el->col] < varord[el->r_down->col]
*/
/**
 * Bump the order count of row by one and re-sort its entry in the orderlist.
 * Does nothing unless so->do_flag is set.
 */
inline void increase_order(SparseObj* so, unsigned row) {
    if (so->do_flag) {
        Item* const entry = so->roworder[row];
        // Take the entry out, update its key, and put it back at its new position.
        delete_item(entry);
        ++entry->norder;
        insert(so, entry);
    }
}

/**
 * Return pointer to (row, col) element maintaining order in rows.
 *
 * See check_assert for the link invariants this matrix is supposed to satisfy.
 * If new_elem is non-null and an element would otherwise be created, new_elem
 * is linked in instead of allocating a fresh one. This is because linking an
 * element is highly nontrivial. The biggest difference is that elements are no
 * longer removed and this saves much time allocating and freeing during the
 * solve phase.
 *
 * @tparam code_to_enable with enabled_code::compute_only the allocating branch
 *         is compiled out and replaced by assert(false).
 * @param so       sparse matrix being edited
 * @param row      row index; mapped through so->varord to its solution position
 * @param col      column index; mapped through so->varord to its solution position
 * @param new_elem element to link at (row, col), or nullptr to allocate one
 * @return the diagonal element when both indices map to the same position, an
 *         already linked element when the column search finds one with this
 *         row, otherwise the element that was just linked in (new_elem or the
 *         freshly allocated one).
 */
template <enabled_code code_to_enable = enabled_code::all>
Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new_elem) {
    Elm *el, *elnext;

    // Positions of the requested row/column in the current solution order.
    unsigned vrow = so->varord[row];
    unsigned vcol = so->varord[col];

    if (vrow == vcol) {
        return so->diag[vrow]; /* a common case */
    }
    if (vrow > vcol) { /* in the lower triangle */
        /* search downward from diag[vcol] */
        for (el = so->diag[vcol];; el = elnext) {
            elnext = el->r_down;
            if (!elnext) {
                break;
            } else if (elnext->row == row) { /* found it */
                return elnext;
            } else if (so->varord[elnext->row] > vrow) {
                break;
            }
        }
        /* insert below el */
        if (!new_elem) {
            if constexpr (code_to_enable == enabled_code::compute_only) {
                // Dynamic allocation should not happen during the compute phase.
                assert(false);
            } else {
                // Allocate the element with one value slot per padded instance and
                // account for the extra entry in this row's order count.
                new_elem = new Elm{};
                new_elem->value = new double[so->_cntml_padded];
                increase_order(so, row);
            }
        }
        // Column links: splice new_elem between el and el->r_down.
        new_elem->r_down = el->r_down;
        el->r_down = new_elem;
        new_elem->r_up = el;
        if (new_elem->r_down) {
            new_elem->r_down->r_up = new_elem;
        }
        /* search leftward from diag[vrow] */
        for (el = so->diag[vrow];; el = elnext) {
            elnext = el->c_left;
            if (!elnext) {
                break;
            } else if (so->varord[elnext->col] < vcol) {
                break;
            }
        }
        /* insert to left of el */
        new_elem->c_left = el->c_left;
        el->c_left = new_elem;
        new_elem->c_right = el;
        if (new_elem->c_left) {
            new_elem->c_left->c_right = new_elem;
        } else {
            // Nothing further left: new_elem is now the first element of the row.
            so->rowst[vrow] = new_elem;
        }
    } else { /* in the upper triangle */
        /* search upward from diag[vcol] */
        for (el = so->diag[vcol];; el = elnext) {
            elnext = el->r_up;
            if (!elnext) {
                break;
            } else if (elnext->row == row) { /* found it */
                return elnext;
            } else if (so->varord[elnext->row] < vrow) {
                break;
            }
        }
        /* insert above el */
        if (!new_elem) {
            if constexpr (code_to_enable == enabled_code::compute_only) {
                // Dynamic allocation should not happen during the compute phase.
                assert(false);
            } else {
                new_elem = new Elm{};
                new_elem->value = new double[so->_cntml_padded];
                increase_order(so, row);
            }
        }
        // Column links: splice new_elem between el->r_up and el.
        new_elem->r_up = el->r_up;
        el->r_up = new_elem;
        new_elem->r_down = el;
        if (new_elem->r_up) {
            new_elem->r_up->r_down = new_elem;
        }
        /* search right from diag[vrow] */
        for (el = so->diag[vrow];; el = elnext) {
            elnext = el->c_right;
            if (!elnext) {
                break;
            } else if (so->varord[elnext->col] > vcol) {
                break;
            }
        }
        /* insert to right of el */
        new_elem->c_right = el->c_right;
        el->c_right = new_elem;
        new_elem->c_left = el;
        if (new_elem->c_right) {
            new_elem->c_right->c_left = new_elem;
        }
    }
    // Record the coordinates on the element that was linked in.
    new_elem->row = row;
    new_elem->col = col;
    return new_elem;
}

/**
 * The following routines support the concept of a list. Modified from modl. The
 * list is a doubly linked list. A special item with element 0 is always at the
 * tail of the list and is denoted as the List pointer itself. list->next point
 * to the first item in the list and list->prev points to the last item in the
 * list. i.e. the list is circular. Note that in an empty list next and prev
 * points to itself.
 *
 * It is intended that this implementation be hidden from the user via the
 * following function calls.
 */
/** Allocate an empty list: a single head item whose next and prev point to itself. */
inline List* newlist() {
    List* const head = new Item{};
    head->next = head;
    head->prev = head;
    return head;
}

/* Delete every item linked into the list and then the list head itself (free
   the list but not the elements). */
inline void freelist(List* list) {
    Item* node = list->next;
    while (node != list) {
        // Read the successor before the node is destroyed.
        Item* const following = node->next;
        delete node;
        node = following;
    }
    delete list;
}

/**
 * Consistency check of the matrix links for every solution position
 * 1..so->neqn. All checks are assert()s, nothing is modified.
 *
 * Checked per position i:
 *  - a diagonal element exists, lies on the diagonal, and varord maps its row to i
 *  - the first element of the row list belongs to the same row as the diagonal
 *  - walking the row to the right, c_left/c_right agree and varord of the
 *    columns strictly increases
 *  - walking the column down/up from the diagonal, r_up/r_down agree and
 *    varord of the rows strictly increases/decreases
 */
inline void check_assert(SparseObj* so) {
    /* check that all links are consistent */
    for (unsigned i = 1; i <= so->neqn; i++) {
        assert(so->diag[i]);
        assert(so->diag[i]->row == so->diag[i]->col);
        assert(so->varord[so->diag[i]->row] == i);
        assert(so->rowst[i]->row == so->diag[i]->row);
        // Row i, left to right.
        for (Elm* el = so->rowst[i]; el; el = el->c_right) {
            if (el == so->rowst[i]) {
                assert(el->c_left == nullptr);
            } else {
                assert(el->c_left->c_right == el);
                assert(so->varord[el->c_left->col] < so->varord[el->col]);
            }
        }
        // Column of the pivot, below the diagonal.
        for (Elm* el = so->diag[i]->r_down; el; el = el->r_down) {
            assert(el->r_up->r_down == el);
            assert(so->varord[el->r_up->row] < so->varord[el->row]);
        }
        // Column of the pivot, above the diagonal.
        for (Elm* el = so->diag[i]->r_up; el; el = el->r_up) {
            assert(el->r_down->r_up == el);
            assert(so->varord[el->r_down->row] > so->varord[el->row]);
        }
    }
}

/* at this point row links are out of order for diag[i]->col
   and col links are out of order for diag[i]->row.

   re_link repairs this for solution position i in two stages:
   1. every element of row i is unhooked from its column list, and every
      element of the pivot column (below and above the diagonal) is unhooked
      from its row list;
   2. the diagonal's own four links are cleared and each of the unhooked
      elements is handed back to getelm, which links it at its proper place. */
inline void re_link(SparseObj* so, unsigned i) {
    // Stage 1a: take the elements of row i out of their column lists.
    for (Elm* el = so->rowst[i]; el; el = el->c_right) {
        /* repair hole */
        if (el->r_up)
            el->r_up->r_down = el->r_down;
        if (el->r_down)
            el->r_down->r_up = el->r_up;
    }

    // Stage 1b: take the elements below the pivot out of their row lists.
    for (Elm* el = so->diag[i]->r_down; el; el = el->r_down) {
        /* repair hole */
        if (el->c_right)
            el->c_right->c_left = el->c_left;
        if (el->c_left)
            el->c_left->c_right = el->c_right;
        else
            // el was the first element of its row, so the row now starts at its right neighbour.
            so->rowst[so->varord[el->row]] = el->c_right;
    }

    // Stage 1c: same for the elements above the pivot.
    for (Elm* el = so->diag[i]->r_up; el; el = el->r_up) {
        /* repair hole */
        if (el->c_right)
            el->c_right->c_left = el->c_left;
        if (el->c_left)
            el->c_left->c_right = el->c_right;
        else
            so->rowst[so->varord[el->row]] = el->c_right;
    }

    /* matrix is consistent except that diagonal row elements are unlinked from
    their columns and the diagonal column elements are unlinked from their
    rows.
    For simplicity discard all knowledge of links and use getelm to relink
    */
    Elm *dright, *dleft, *dup, *ddown, *elnext;

    // Remember the four chains hanging off the diagonal before clearing its links.
    so->rowst[i] = so->diag[i];
    dright = so->diag[i]->c_right;
    dleft = so->diag[i]->c_left;
    dup = so->diag[i]->r_up;
    ddown = so->diag[i]->r_down;
    so->diag[i]->c_right = so->diag[i]->c_left = nullptr;
    so->diag[i]->r_up = so->diag[i]->r_down = nullptr;
    // In each loop the successor is read first because getelm rewrites el's links.
    for (Elm* el = dright; el; el = elnext) {
        elnext = el->c_right;
        getelm(so, el->row, el->col, el);
    }
    for (Elm* el = dleft; el; el = elnext) {
        elnext = el->c_left;
        getelm(so, el->row, el->col, el);
    }
    for (Elm* el = dup; el; el = elnext) {
        elnext = el->r_up;
        getelm(so, el->row, el->col, el);
    }
    for (Elm* el = ddown; el; el = elnext) {
        elnext = el->r_down;
        getelm(so, el->row, el->col, el);
    }
}

/**
 * Forget all elements: the rowst and diag entries 1..so->neqn are reset to
 * nullptr. The Elm objects themselves are not deallocated by this function.
 */
inline void free_elm(SparseObj* so) {
    unsigned const last = so->neqn;
    for (unsigned idx = 1; idx <= last; ++idx) {
        so->diag[idx] = nullptr;
        so->rowst[idx] = nullptr;
    }
}

/**
 * The matrix has been set up: (re)build so->roworder and so->orderlist.
 *
 * One Item is created per equation. Its norder is the number of elements
 * currently linked in the corresponding row, and the items are inserted into
 * the orderlist sorted by that count. Also switches order bookkeeping on
 * (so->do_flag = 1).
 */
inline void init_minorder(SparseObj* so) {
    so->do_flag = 1;

    // Dispose of the item table left by an earlier call, if there is one.
    if (so->roworder) {
        for (unsigned k = 1; k <= so->nroworder; ++k) {
            delete so->roworder[k];
        }
        delete[] so->roworder;
    }
    so->roworder = new Item* [so->neqn + 1] {};
    so->nroworder = so->neqn;

    // Start from a fresh, empty orderlist.
    if (so->orderlist) {
        freelist(so->orderlist);
    }
    so->orderlist = newlist();

    // All items must exist before the fill loop: it indexes by row, not by position.
    for (unsigned k = 1; k <= so->neqn; ++k) {
        so->roworder[k] = new Item{};
    }

    for (unsigned pos = 1; pos <= so->neqn; ++pos) {
        unsigned count = 0;
        Elm* el = so->rowst[pos];
        while (el) {
            ++count;
            el = el->c_right;
        }
        Elm* const pivot = so->diag[pos];
        Item* const entry = so->roworder[pivot->row];
        entry->elm = pivot;
        entry->norder = count;
        insert(so, entry);
    }
}

/**
 * Lower the order count of row by one and re-sort its entry in the orderlist.
 * Does nothing unless so->do_flag is set.
 */
inline void reduce_order(SparseObj* so, unsigned row) {
    if (so->do_flag) {
        Item* const entry = so->roworder[row];
        // Take the entry out, update its key, and put it back at its new position.
        delete_item(entry);
        --entry->norder;
        insert(so, entry);
    }
}

/**
 * Choose the pivot for solution position i and create the fill-in it causes.
 *
 * The pivot is the row at the head of so->orderlist. If that row currently
 * sits at a later position j, positions i..j-1 are shifted up by one, the
 * chosen row is placed at i and re_link repairs the element links. Afterwards
 * every (row below pivot, column right of pivot) element is requested through
 * getelm so that it exists, the order count of each row below the pivot is
 * reduced, and the pivot's item is removed from the orderlist.
 */
inline void get_next_pivot(SparseObj* so, unsigned i) {
    /* get varord[i], etc. from the head of the orderlist. */
    Item* order = so->orderlist->next;
    assert(order != so->orderlist);

    unsigned j;
    if ((j = so->varord[order->elm->row]) != i) {
        /* push order lists down by 1 and put new diag in empty slot */
        assert(j > i);
        // Keep the chosen row's first element; slot j is overwritten by the shift below.
        Elm* el = so->rowst[j];
        for (; j > i; j--) {
            so->diag[j] = so->diag[j - 1];
            so->rowst[j] = so->rowst[j - 1];
            so->varord[so->diag[j]->row] = j;
        }
        so->diag[i] = order->elm;
        so->rowst[i] = el;
        so->varord[so->diag[i]->row] = i;
        /* at this point row links are out of order for diag[i]->col
           and col links are out of order for diag[i]->row */
        re_link(so, i);
    }

    /* now make sure all needed elements exist */
    for (Elm* el = so->diag[i]->r_down; el; el = el->r_down) {
        for (Elm* pivot = so->diag[i]->c_right; pivot; pivot = pivot->c_right) {
            // nullptr: getelm allocates the element if it is not linked yet.
            getelm(so, el->row, pivot->col, nullptr);
        }
        reduce_order(so, el->row);
    }
    // The pivot row is done; take it out of the list of candidate rows.
    delete_item(order);
}

/**
 * Reallocate space for the matrix.
 *
 * No-op when maxeqn equals the current so->neqn. Otherwise the per-equation
 * tables (rowst, diag, varord, rhs, ngetcall) are released and reallocated,
 * the identity ordering is installed, and one diagonal element is created per
 * equation (indices 1..maxeqn; index 0 of the tables is not used here).
 *
 * The right hand side holds (maxeqn + 1) * so->_cntml_padded entries and is
 * zero-initialised in full. Rows are addressed from 1, so the last row lives
 * in the final _cntml_padded entries and must be covered as well.
 *
 * NOTE(review): free_elm only clears the rowst/diag entries; Elm objects and
 * their value arrays created by an earlier call are not deallocated here.
 *
 * @param so     sparse matrix whose storage is (re)built
 * @param maxeqn new number of equations
 */
inline void initeqn(SparseObj* so, unsigned maxeqn) {
    if (maxeqn == so->neqn)
        return;
    free_elm(so);
    so->neqn = maxeqn;
    delete[] so->rowst;
    delete[] so->diag;
    delete[] so->varord;
    delete[] so->rhs;
    delete[] so->ngetcall;
    so->elmpool = nullptr;
    so->rowst = new Elm*[maxeqn + 1];
    so->diag = new Elm*[maxeqn + 1];
    so->varord = new unsigned[maxeqn + 1];
    // Value-initialisation zeroes every entry, including the block of row maxeqn.
    so->rhs = new double[(maxeqn + 1) * so->_cntml_padded]{};
    so->ngetcall = new unsigned[so->_cntml_padded];
    for (unsigned i = 1; i <= maxeqn; i++) {
        so->varord[i] = i;
        so->diag[i] = new Elm{};
        so->diag[i]->value = new double[so->_cntml_padded];
        so->rowst[i] = so->diag[i];
        so->diag[i]->row = i;
        so->diag[i]->col = i;
        so->diag[i]->r_down = so->diag[i]->r_up = nullptr;
        so->diag[i]->c_right = so->diag[i]->c_left = nullptr;
    }
}

/**
 * Minimum ordering algorithm to determine the order that the matrix should be
 * solved. Also make sure all needed elements are present. This does not mess up
 * the matrix.
 */
inline void spar_minorder(SparseObj* so) {
    // Links must be consistent both before and after the ordering pass.
    check_assert(so);
    init_minorder(so);
    unsigned position = 1;
    while (position <= so->neqn) {
        get_next_pivot(so, position);
        ++position;
    }
    // Ordering finished: switch the order bookkeeping off again.
    so->do_flag = 0;
    check_assert(so);
}

/**
 * Prepare instance _iml for a new matrix fill: reset its getelm call counter
 * and zero its slot in the value array of every linked element.
 */
inline void init_coef_list(SparseObj* so, int _iml) {
    so->ngetcall[_iml] = 0;
    for (unsigned row = 1; row <= so->neqn; ++row) {
        Elm* el = so->rowst[row];
        while (el) {
            el->value[_iml] = 0.;
            el = el->c_right;
        }
    }
}

#if defined(scopmath_sparse_d) || defined(scopmath_sparse_ix) || defined(scopmath_sparse_s) || \
    defined(scopmath_sparse_x)
#error "naming clash on sparse_thread.hpp-internal macros"
#endif
#define scopmath_sparse_ix(arg) ((arg) *_STRIDE)
/**
 * Elimination step for instance _iml: subtract the pivot row, scaled by
 * rowsub/pivot, from the row containing rowsub (right hand side included).
 * so->numop is incremented once per updated entry.
 */
inline void subrow(SparseObj* so, Elm* pivot, Elm* rowsub, int _iml) {
    // Not referenced by name below; presumably picked up by the _STRIDE
    // expansion inside scopmath_sparse_ix.
    unsigned int const _cntml_padded{so->_cntml_padded};
    double const factor{rowsub->value[_iml] / pivot->value[_iml]};
    so->rhs[scopmath_sparse_ix(rowsub->row)] -= so->rhs[scopmath_sparse_ix(pivot->row)] * factor;
    so->numop++;
    Elm* target = rowsub;
    for (Elm* source = pivot->c_right; source; source = source->c_right) {
        // Step right (at least once) until the entry in the same column is reached.
        do {
            target = target->c_right;
        } while (target->col != source->col);
        target->value[_iml] -= source->value[_iml] * factor;
        so->numop++;
    }
}

/**
 * Back substitution for instance _iml, from the last solution position to the
 * first. On return so->rhs holds the solution; so->numop counts the operations.
 */
inline void bksub(SparseObj* so, int _iml) {
    // Presumably consumed by the _STRIDE expansion inside scopmath_sparse_ix.
    int _cntml_padded = so->_cntml_padded;
    unsigned i = so->neqn;
    while (i >= 1) {
        Elm* const pivot = so->diag[i];
        for (Elm* el = pivot->c_right; el; el = el->c_right) {
            so->rhs[scopmath_sparse_ix(el->row)] -= el->value[_iml] *
                                                    so->rhs[scopmath_sparse_ix(el->col)];
            so->numop++;
        }
        so->rhs[scopmath_sparse_ix(pivot->row)] /= pivot->value[_iml];
        so->numop++;
        i--;
    }
}

/**
 * Solve the linear system of instance _iml in place.
 *
 * Pivots are processed in solution order: every element below a pivot is
 * eliminated with subrow, then bksub produces the solution in so->rhs.
 *
 * @return SINGULAR as soon as a pivot's magnitude is not larger than ROUNDOFF,
 *         otherwise SUCCESS.
 */
inline int matsol(SparseObj* so, int _iml) {
    /* Upper triangularization */
    so->numop = 0;
    for (unsigned i = 1; i <= so->neqn; i++) {
        Elm* pivot{so->diag[i]};
        if (fabs(pivot->value[_iml]) <= ROUNDOFF) {
            return SINGULAR;
        }
        // Eliminate all elements in pivot column. The OpenACC annotation here
        // is to avoid problems with nvc++'s automatic parallelisation; see:
        // https://forums.developer.nvidia.com/t/device-kernel-hangs-at-o-and-above/212733
        nrn_pragma_acc(loop seq)
        for (auto el = pivot->r_down; el; el = el->r_down) {
            subrow(so, pivot, el, _iml);
        }
    }
    bksub(so, _iml);
    return SUCCESS;
}

/**
 * Build the matrix structure and the coefficient list used for fast setup.
 *
 * fun is invoked twice:
 *  - phase 1 (count): elements are created as they are requested and
 *    so->ngetcall[0] counts the requests; that count sizes so->coef_list.
 *  - phase 2 (build), after minimum ordering: each request stores the
 *    element's value pointer in so->coef_list, in request order.
 * so->phase is reset to 0 (solution phase) before returning.
 *
 * @param so  sparse matrix to (re)build
 * @param n   number of equations
 * @param fun callable invoked as fun(so, so->rhs, <thread args>)
 */
template <typename SPFUN>
void create_coef_list(SparseObj* so, int n, SPFUN fun, _threadargsproto_) {
    initeqn(so, static_cast<unsigned>(n));
    so->phase = 1;
    so->ngetcall[0] = 0;
    fun(so, so->rhs, _threadargs_);  // std::invoke in C++17
    // coef_list is allocated with new[] below, so it has to be released with
    // delete[] (which is also a no-op for nullptr).
    delete[] so->coef_list;
    so->coef_list_size = so->ngetcall[0];
    so->coef_list = new double*[so->coef_list_size];
    spar_minorder(so);
    so->phase = 2;
    so->ngetcall[0] = 0;
    fun(so, so->rhs, _threadargs_);  // std::invoke in C++17
    so->phase = 0;
}

/**
 * Element access whose behaviour depends on so->phase:
 *  - phase 0: return the next pointer recorded in so->coef_list
 *  - phase 1: look up (creating if needed) the element and count the request
 *  - otherwise: look up the element and record its value pointer in so->coef_list
 * In every case so->ngetcall[_iml] is advanced by one.
 */
template <enabled_code code_to_enable = enabled_code::all>
double* thread_getelm(SparseObj* so, int row, int col, int _iml) {
    if (so->phase == 0) {
        return so->coef_list[so->ngetcall[_iml]++];
    }
    Elm* const el = scopmath::sparse::getelm<code_to_enable>(so,
                                                             static_cast<unsigned>(row),
                                                             static_cast<unsigned>(col),
                                                             nullptr);
    double* const values = el->value;
    if (so->phase != 1) {
        so->coef_list[so->ngetcall[_iml]] = values;
    }
    so->ngetcall[_iml]++;
    return values;
}
}  // namespace sparse
}  // namespace scopmath

// Methods that may be called from translated MOD files are kept outside the
// scopmath::sparse namespace.
#define scopmath_sparse_s(arg) _p[scopmath_sparse_ix(s[arg])]
#define scopmath_sparse_d(arg) _p[scopmath_sparse_ix(d[arg])]

/**
 * sparse matrix dynamic allocation: create_coef_list makes a list for fast
 * setup, does minimum ordering and ensures all elements needed are present.
 * This could easily be made recursive but it isn't right now.
 *
 * Allocates a SparseObj, builds its structure by running fun through
 * create_coef_list for instance 0 of ml, and copies it to the device via
 * nrn_sparseobj_copyto_device.
 *
 * @param fun callable describing the equations; forwarded to create_coef_list
 * @param n   number of equations
 * @param ml  mechanism instance list supplying data, pdata and node indices
 * @return the newly allocated SparseObj as void*
 */
template <typename SPFUN>
void* nrn_cons_sparseobj(SPFUN fun, int n, Memb_list* ml, _threadargsproto_) {
    // fill in the unset _threadargsproto_ assuming _iml = 0;
    _iml = 0; /* from _threadargsproto_ */
    _p = ml->data;
    _ppvar = ml->pdata;
    // Must follow the assignment of _iml, which selects the node.
    _v = _nt->_actual_v[ml->nodeindices[_iml]];
    SparseObj* so{new SparseObj};
    // The per-instance arrays allocated later are sized by this value.
    so->_cntml_padded = _cntml_padded;
    scopmath::sparse::create_coef_list(so, n, fun, _threadargs_);
    nrn_sparseobj_copyto_device(so);
    return so;
}

/**
 * Experimental numerical method for SCoP-3 which integrates kinetic rate
 * equations. It is intended to be used only by models generated by MODL, and
 * its identity is meant to be concealed from the user.
 *
 * The states are saved into the derivative slots, then the system assembled by
 * fun is solved repeatedly and the solution added to the states until the
 * summed magnitude of the update is no larger than CONVERGE (a single pass
 * when linflag is set). Finally fun is evaluated once more and the derivative
 * slots are set to (new state - saved state) / dt.
 *
 * @param so sparse matrix object holding structure, values and right hand side
 * @param n number of state variables
 * @param s indices of the state variables
 * @param d indices of the derivatives of states
 * @param t pointer to the independent variable (not referenced in this body)
 * @param dt the time step
 * @param fun callable corresponding to the kinetic block equations
 * @param linflag solve as linear equations; when nonlinear (zero), all states
 *                are forced >= 0
 * @return SUCCESS, the nonzero code reported by matsol, or EXCEED_ITERS when
 *         the iteration counter is above MAXSTEPS after an update
 */
template <typename F>
int sparse_thread(SparseObj* so,
                  int n,
                  int* s,
                  int* d,
                  double* t,
                  double dt,
                  F fun,
                  int linflag,
                  _threadargsproto_) {
    /* save old state */
    for (int k = 0; k < n; ++k) {
        scopmath_sparse_d(k) = scopmath_sparse_s(k);
    }

    int iteration = 0;
    double err = 1;
    while (err > CONVERGE) {
        scopmath::sparse::init_coef_list(so, _iml);
        fun(so, so->rhs, _threadargs_);  // std::invoke in C++17
        int const ierr = scopmath::sparse::matsol(so, _iml);
        if (ierr) {
            return ierr;
        }
        // Apply the update; matrix rows are numbered from 1, states from 0.
        err = 0.;
        for (int row = 1; row <= n; ++row) {
            scopmath_sparse_s(row - 1) += so->rhs[scopmath_sparse_ix(row)];
            if (!linflag && scopmath_sparse_s(row - 1) < 0.) {
                scopmath_sparse_s(row - 1) = 0.;
            }
            err += fabs(so->rhs[scopmath_sparse_ix(row)]);
        }
        if (iteration > MAXSTEPS) {
            return EXCEED_ITERS;
        }
        if (linflag) {
            break;
        }
        ++iteration;
    }

    scopmath::sparse::init_coef_list(so, _iml);
    fun(so, so->rhs, _threadargs_);  // std::invoke in C++17
    /* restore Dstate at t+dt */
    for (int k = 0; k < n; ++k) {
        scopmath_sparse_d(k) = (scopmath_sparse_s(k) - scopmath_sparse_d(k)) / dt;
    }
    return SUCCESS;
}
#undef scopmath_sparse_d
#undef scopmath_sparse_ix
#undef scopmath_sparse_s
#define scopmath_sparse_x(arg) _p[x[arg] * _STRIDE]
/* for solving ax=b */
template <typename SPFUN>
int _cvode_sparse_thread(void** vpr, int n, int* x, SPFUN fun, _threadargsproto_) {
    SparseObj* so = (SparseObj*) (*vpr);
    if (!so) {
        so = new SparseObj{};
        *vpr = so;
    }
    scopmath::sparse::create_coef_list(so, n, fun, _threadargs_); /* calls fun twice */
    scopmath::sparse::init_coef_list(so, _iml);
    fun(so, so->rhs, _threadargs_);  // std::invoke in C++17
    int ierr;
    if ((ierr = scopmath::sparse::matsol(so, _iml))) {
        return ierr;
    }
    for (int i = 1; i <= n; i++) { /* why oh why did I write it from 1 */
        scopmath_sparse_x(i - 1) = so->rhs[i];
    }
    return SUCCESS;
}
#undef scopmath_sparse_x

/**
 * Release a SparseObj: its device copy, the per-equation tables, the
 * coefficient list, the per-instance call counters, the row-order items, the
 * orderlist and finally the object itself. A null pointer is ignored.
 *
 * NOTE(review): the Elm nodes and their value arrays reachable from
 * rowst/diag are not deallocated here — confirm whether that is intended.
 */
inline void _nrn_destroy_sparseobj_thread(SparseObj* so) {
    if (!so) {
        return;
    }
    nrn_sparseobj_delete_from_device(so);
    delete[] so->rowst;
    delete[] so->diag;
    delete[] so->varord;
    delete[] so->rhs;
    // Allocated with new[] in initeqn together with the tables above.
    delete[] so->ngetcall;
    delete[] so->coef_list;
    if (so->roworder) {
        for (int ii = 1; ii <= so->nroworder; ++ii) {
            delete so->roworder[ii];
        }
        delete[] so->roworder;
    }
    if (so->orderlist) {
        scopmath::sparse::freelist(so->orderlist);
    }
    delete so;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/scopmath/ssimplic_thread.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once
#include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"

namespace coreneuron {

#if defined(scopmath_ssimplic_s)
#error "naming clash on ssimplic_thread.hpp-internal macros"
#endif
#define scopmath_ssimplic_s(arg) _p[s[arg] * _STRIDE]
/**
 * Clamp every state below -1e-6 to zero.
 *
 * @return 1 when no state had to be clamped, 0 otherwise.
 */
static int check_state(int n, int* s, _threadargsproto_) {
    int all_valid = 1;
    for (int k = 0; k < n; ++k) {
        if (scopmath_ssimplic_s(k) < -1e-6) {
            scopmath_ssimplic_s(k) = 0.;
            all_valid = 0;
        }
    }
    return all_valid;
}
#undef scopmath_ssimplic_s

/**
 * Steady-state variant of sparse_thread: the solve is run with a time step of
 * 1e9, which is also stored in _nt->_dt for the duration of the call; _nt->_dt
 * is set to dt before returning.
 *
 * With linflag set, sparse_thread is called once with its own linflag
 * argument 0. Otherwise up to 7 attempts are made: sparse_thread with
 * linflag 1 and, if that succeeds and check_state reports no clamped state, a
 * further call with linflag 0. The first attempt ending without error stops
 * the loop.
 *
 * @return the error code of the last sparse_thread call
 */
template <typename SPFUN>
int _ss_sparse_thread(SparseObj* so,
                      int n,
                      int* s,
                      int* d,
                      double* t,
                      double dt,
                      SPFUN fun,
                      int linflag,
                      _threadargsproto_) {
    double const ss_dt = 1e9;
    _nt->_dt = ss_dt;

    int err = 0;
    if (linflag) {
        err = sparse_thread(so, n, s, d, t, ss_dt, fun, 0, _threadargs_);
    } else {
        for (int attempts_left = 7; attempts_left > 0; --attempts_left) {
            err = sparse_thread(so, n, s, d, t, ss_dt, fun, 1, _threadargs_);
            if (!err && check_state(n, s, _threadargs_)) {
                err = sparse_thread(so, n, s, d, t, ss_dt, fun, 0, _threadargs_);
            }
            if (!err) {
                break;
            }
        }
    }

    _nt->_dt = dt;
    return err;
}

/**
 * Run fun with _nt->_dt temporarily set to 1e-9 and restore the previous
 * value afterwards.
 *
 * @return whatever fun returns
 */
template <typename DIFUN>
int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) {
    double const saved_dt = _nt->_dt;
    _nt->_dt = 1e-9;
    int const status = fun(_threadargs_);
    _nt->_dt = saved_dt;
    return status;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/solve_core.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/nrnconf.h"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/sim/multicore.hpp"
namespace coreneuron {
bool use_solve_interleave;

static void triang(NrnThread*), bksub(NrnThread*);

/* Solve the matrix equation of one thread: triangularization followed by back
   substitution, or the interleaved solver when use_solve_interleave is set. */
void nrn_solve_minimal(NrnThread* _nt) {
    if (!use_solve_interleave) {
        triang(_nt);
        bksub(_nt);
        return;
    }
    solve_interleaved(_nt->id);
}

/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and
 *  --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented.
 */

/* triangularization of the matrix equations
 *
 * Walks the nodes from index end-1 down to ncell. For each node the factor
 * p = a/d is computed and the parent's diagonal and right hand side are
 * reduced by p*b and p*rhs of the node.
 */
static void triang(NrnThread* _nt) {
    int i2 = _nt->ncell;
    int i3 = _nt->end;

    double* vec_a = &(VEC_A(0));
    double* vec_b = &(VEC_B(0));
    double* vec_d = &(VEC_D(0));
    double* vec_rhs = &(VEC_RHS(0));
    int* parent_index = _nt->_v_parent_index;

    // The loop is marked sequential for OpenACC: an iteration writes to the
    // parent's entries, which other iterations read and write as well.
    nrn_pragma_acc(parallel loop seq present(
        vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
                       async(_nt->stream_id) if (_nt->compute_gpu))
    nrn_pragma_omp(target if (_nt->compute_gpu))
    for (int i = i3 - 1; i >= i2; --i) {
        double p = vec_a[i] / vec_d[i];
        vec_d[parent_index[i]] -= p * vec_b[i];
        vec_rhs[parent_index[i]] -= p * vec_rhs[i];
    }
}

/* back substitution to finish solving the matrix equations
 *
 * First the entries [0, ncell) are divided by their diagonal. Then, in
 * increasing index order over [ncell, end), each entry subtracts b times its
 * parent's (already final) value and is divided by its diagonal. When the
 * thread computes on the GPU the function waits for the thread's stream
 * before returning.
 */
static void bksub(NrnThread* _nt) {
    int i1 = 0;
    int i2 = i1 + _nt->ncell;
    int i3 = _nt->end;

    double* vec_b = &(VEC_B(0));
    double* vec_d = &(VEC_D(0));
    double* vec_rhs = &(VEC_RHS(0));
    int* parent_index = _nt->_v_parent_index;

    // Entries [0, ncell): no parent term is applied.
    nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2])
                       async(_nt->stream_id) if (_nt->compute_gpu))
    nrn_pragma_omp(target if (_nt->compute_gpu))
    for (int i = i1; i < i2; ++i) {
        vec_rhs[i] /= vec_d[i];
    }

    // Remaining entries: each iteration reads the parent's result, hence sequential.
    nrn_pragma_acc(
        parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
            async(_nt->stream_id) if (_nt->compute_gpu))
    nrn_pragma_omp(target if (_nt->compute_gpu))
    for (int i = i2; i < i3; ++i) {
        vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]];
        vec_rhs[i] /= vec_d[i];
    }

    if (_nt->compute_gpu) {
        nrn_pragma_acc(wait(_nt->stream_id))
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/sim/treeset_core.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <string>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/utils/profile/profiler_interface.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"

namespace coreneuron {
/*
Fixed step method with threads and cache efficiency. No extracellular,
sparse matrix, multisplit, or legacy features.
*/

/**
 * Assemble the right hand side for one thread.
 *
 * Steps, in order:
 *  1. zero rhs and d for all nodes (and the saved fast-imem arrays if present)
 *  2. run the BEFORE_BREAKPOINT callbacks through nrn_ba
 *  3. call the `current` function of every mechanism that has one, each inside
 *     an Instrumentor phase named "cur-<mechanism name>"
 *  4. if fast imem is enabled, subtract rhs from the saved rhs
 *  5. add the axial current terms for the nodes [ncell, end)
 */
static void nrn_rhs(NrnThread* _nt) {
    int i1 = 0;
    int i2 = i1 + _nt->ncell;
    int i3 = _nt->end;

    double* vec_rhs = &(VEC_RHS(0));
    double* vec_d = &(VEC_D(0));
    double* vec_a = &(VEC_A(0));
    double* vec_b = &(VEC_B(0));
    double* vec_v = &(VEC_V(0));
    int* parent_index = _nt->_v_parent_index;

    // Step 1: clear rhs and d.
    nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
                       async(_nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
    for (int i = i1; i < i3; ++i) {
        vec_rhs[i] = 0.;
        vec_d[i] = 0.;
    }

    if (_nt->nrn_fast_imem) {
        double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d;
        double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;
        nrn_pragma_acc(
            parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
                async(_nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
        for (int i = i1; i < i3; ++i) {
            fast_imem_d[i] = 0.;
            fast_imem_rhs[i] = 0.;
        }
    }

    // Step 2.
    nrn_ba(_nt, BEFORE_BREAKPOINT);
    /* note that CAP has no current */
    // Step 3: mechanism currents.
    for (auto tml = _nt->tml; tml; tml = tml->next)
        if (corenrn.get_memb_func(tml->index).current) {
            mod_f_t s = corenrn.get_memb_func(tml->index).current;
            std::string ss("cur-");
            ss += nrn_get_mechname(tml->index);
            Instrumentor::phase p(ss.c_str());
            (*s)(_nt, tml->ml, tml->index);
#ifdef DEBUG
            if (errno) {
                hoc_warning("errno set during calculation of currents", nullptr);
            }
#endif
        }

    if (_nt->nrn_fast_imem) {
        /* _nrn_save_rhs has only the contribution of electrode current
           so here we transform so it only has membrane current contribution
        */
        double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
        nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
                           async(_nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
        for (int i = i1; i < i3; ++i) {
            p[i] -= vec_rhs[i];
        }
    }

    /* now the internal axial currents.
    The extracellular mechanism contribution is already done.
            rhs += ai_j*(vi_j - vi)
    */
    nrn_pragma_acc(parallel loop present(vec_rhs [0:i3],
                                         vec_d [0:i3],
                                         vec_a [0:i3],
                                         vec_b [0:i3],
                                         vec_v [0:i3],
                                         parent_index [0:i3]) if (_nt->compute_gpu)
                       async(_nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
    for (int i = i2; i < i3; ++i) {
        double dv = vec_v[parent_index[i]] - vec_v[i];
        /* our connection coefficients are negative so */
        // Atomic updates: this loop runs in parallel and several nodes can
        // share the same parent entry.
        nrn_pragma_acc(atomic update)
        nrn_pragma_omp(atomic update)
        vec_rhs[i] -= vec_b[i] * dv;
        nrn_pragma_acc(atomic update)
        nrn_pragma_omp(atomic update)
        vec_rhs[parent_index[i]] += vec_a[i] * dv;
    }
}

/* calculate left hand side of
cm*dvm/dt = -i(vm) + is(vi) + ai_j*(vi_j - vi)
cx*dvx/dt - cm*dvm/dt = -gx*(vx - ex) + i(vm) + ax_j*(vx_j - vx)
with a matrix so that the solution is of the form [dvm+dvx,dvx] on the right
hand side after solving.
This is a common operation for fixed step, cvode, and daspk methods

Steps, in order:
 1. call the `jacob` function of every mechanism that has one
 2. add the capacitance contribution through nrn_jacob_capacitance
 3. if fast imem is enabled, add d to the saved d
 4. subtract the axial coefficients from the diagonal of each node in
    [ncell, end) and of its parent
*/

static void nrn_lhs(NrnThread* _nt) {
    int i1 = 0;
    int i2 = i1 + _nt->ncell;
    int i3 = _nt->end;

    /* note that CAP has no jacob */
    for (auto tml = _nt->tml; tml; tml = tml->next)
        if (corenrn.get_memb_func(tml->index).jacob) {
            mod_f_t s = corenrn.get_memb_func(tml->index).jacob;
            // NOTE(review): the phase label uses the "cur-" prefix although this
            // is the jacobian pass — confirm whether that is intended.
            std::string ss("cur-");
            ss += nrn_get_mechname(tml->index);
            Instrumentor::phase p(ss.c_str());
            (*s)(_nt, tml->ml, tml->index);
#ifdef DEBUG
            if (errno) {
                hoc_warning("errno set during calculation of jacobian", nullptr);
            }
#endif
        }
    /* now the cap current can be computed because any change to cm by another model
    has taken effect
    */
    /* note, the first is CAP if there are any nodes*/
    if (_nt->end && _nt->tml) {
        assert(_nt->tml->index == CAP);
        nrn_jacob_capacitance(_nt, _nt->tml->ml, _nt->tml->index);
    }

    double* vec_d = &(VEC_D(0));
    double* vec_a = &(VEC_A(0));
    double* vec_b = &(VEC_B(0));
    int* parent_index = _nt->_v_parent_index;

    if (_nt->nrn_fast_imem) {
        /* _nrn_save_d has only the contribution of electrode current
           so here we transform so it only has membrane current contribution
        */
        double* p = _nt->nrn_fast_imem->nrn_sav_d;
        nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
        for (int i = i1; i < i3; ++i) {
            p[i] += vec_d[i];
        }
    }

    /* now add the axial currents */
    nrn_pragma_acc(parallel loop present(
        vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
                       async(_nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
    for (int i = i2; i < i3; ++i) {
        // Atomic updates: this loop runs in parallel and several nodes can
        // share the same parent entry.
        nrn_pragma_acc(atomic update)
        nrn_pragma_omp(atomic update)
        vec_d[i] -= vec_b[i];
        nrn_pragma_acc(atomic update)
        nrn_pragma_omp(atomic update)
        vec_d[parent_index[i]] -= vec_a[i];
    }
}

/* for the fixed step method */
/**
 * Build the tree matrix for one thread: first the right-hand side (nrn_rhs),
 * then the left-hand side (nrn_lhs).
 *
 * @param _nt thread whose matrix is assembled
 * @return always nullptr
 */
void* setup_tree_matrix_minimal(NrnThread* _nt) {
    nrn_rhs(_nt);
    nrn_lhs(_nt);
    return nullptr;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/ivocvect.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/utils/ivocvect.hpp"
#include "coreneuron/utils/offload.hpp"

namespace coreneuron {
/// Heap-allocate a vector with room for @p n doubles.
IvocVect* vector_new(int n) {
    auto* vec = new IvocVect(n);
    return vec;
}

/// Number of elements stored in @p v, converted to int.
int vector_capacity(IvocVect* v) {
    return static_cast<int>(v->size());
}

/// Pointer to the first element of @p v.
double* vector_vec(IvocVect* v) {
    double* const first = v->data();
    return first;
}

/*
 * Backwards-compatible entry points (untyped pointer flavour of the API above)
 */
IvocVect* vector_new1(int n) {
    return vector_new(n);
}

nrn_pragma_acc(routine seq)
int vector_capacity(void* v) {
    auto* vec = static_cast<IvocVect*>(v);
    return static_cast<int>(vec->size());
}

nrn_pragma_acc(routine seq)
double* vector_vec(void* v) {
    auto* vec = static_cast<IvocVect*>(v);
    return vec->data();
}

}  // namespace coreneuron


================================================
FILE: coreneuron/utils/ivocvect.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/utils/offload.hpp"

#include <cstdio>
#include <utility>

namespace coreneuron {
/**
 * Minimal owning array of fixed length.
 *
 * The buffer is allocated with `new T[n]` (elements are default-initialised)
 * and released with `delete[]` in the destructor. The type is move-only.
 * A default-constructed or moved-from vector is empty: size() == 0 and
 * data() == nullptr.
 */
template <typename T>
class fixed_vector {
    // Default member initialisers keep a default-constructed object in a valid,
    // empty state so that the destructor never deletes an indeterminate pointer.
    size_t n_{0};

  public:
    T* data_{nullptr}; /*making public for openacc copying */

    fixed_vector() = default;

    /// Allocate storage for @p n elements.
    fixed_vector(size_t n)
        : n_(n) {
        data_ = new T[n_];
    }

    fixed_vector(const fixed_vector& vec) = delete;
    fixed_vector& operator=(const fixed_vector& vec) = delete;

    /// Take over the buffer of @p vec and leave @p vec empty.
    fixed_vector(fixed_vector&& vec) noexcept
        : n_{vec.n_}
        , data_{vec.data_} {
        vec.n_ = 0;
        vec.data_ = nullptr;
    }

    /// Release the current buffer, then take over the buffer of @p vec.
    fixed_vector& operator=(fixed_vector&& vec) noexcept {
        // Self-assignment must not free the buffer we are about to keep.
        if (this != &vec) {
            delete[] data_;
            data_ = vec.data_;
            n_ = vec.n_;
            vec.data_ = nullptr;
            vec.n_ = 0;
        }
        return *this;
    }

    ~fixed_vector() {
        delete[] data_;
    }

    /// Unchecked element access.
    const T& operator[](int i) const {
        return data_[i];
    }
    /// Unchecked element access.
    T& operator[](int i) {
        return data_[i];
    }

    nrn_pragma_acc(routine seq)
    const T* data(void) const {
        return data_;
    }

    nrn_pragma_acc(routine seq)
    T* data(void) {
        return data_;
    }

    nrn_pragma_acc(routine seq)
    size_t size() const {
        return n_;
    }
};

/// Vector of doubles used by the vector_* helper API below.
using IvocVect = fixed_vector<double>;

/// Allocate a new vector of @p n doubles with `new`.
extern IvocVect* vector_new(int n);
/// Number of elements in @p v.
extern int vector_capacity(IvocVect* v);
/// Pointer to the first element of @p v.
extern double* vector_vec(IvocVect* v);

// retro-compatibility API
/// Same as vector_new().
extern IvocVect* vector_new1(int n);
/// Same as vector_capacity(IvocVect*), but @p v is passed as an untyped pointer to an IvocVect.
nrn_pragma_acc(routine seq)
extern int vector_capacity(void* v);
/// Same as vector_vec(IvocVect*), but @p v is passed as an untyped pointer to an IvocVect.
nrn_pragma_acc(routine seq)
extern double* vector_vec(void* v);

}  // namespace coreneuron


================================================
FILE: coreneuron/utils/lpt.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <algorithm>
#include <functional>
#include <numeric>
#include <queue>

#include "coreneuron/nrnconf.h"  // for size_t
#include "coreneuron/utils/lpt.hpp"
#include "coreneuron/utils/nrn_assert.h"

// (index, size) pair: used both for pieces (piece index, piece size) and for
// bags (bag index, accumulated bag size).
using P = std::pair<size_t, size_t>;

// lpt Least Processing Time algorithm.
// Largest piece goes into least size bag.
// in: number of bags, vector of sizes
// return: a new vector of bag indices parallel to the vector of sizes.
//
// The load balance (average bag size / largest bag size, 1.0 is perfect) is
// written to *bal when bal is non-null, otherwise it is printed to stdout.
// Aborts if nbag is 0 or pieces is empty.

std::vector<std::size_t> lpt(std::size_t nbag, std::vector<std::size_t>& pieces, double* bal) {
    nrn_assert(nbag > 0);
    nrn_assert(!pieces.empty());

    std::vector<P> pvec;
    pvec.reserve(pieces.size());
    for (size_t i = 0; i < pieces.size(); ++i) {
        pvec.emplace_back(i, pieces[i]);
    }

    // Orders by decreasing size; as a priority_queue comparator it therefore
    // keeps the *smallest* bag on top.
    auto P_comp = [](const P& a, const P& b) { return a.second > b.second; };

    std::sort(pvec.begin(), pvec.end(), P_comp);

    std::vector<std::size_t> bagindices(pieces.size());

    std::priority_queue<P, std::vector<P>, decltype(P_comp)> bagq(P_comp);
    for (size_t i = 0; i < nbag; ++i) {
        bagq.push(P(i, 0));
    }

    // Hand out the pieces, largest first, always to the currently lightest bag.
    for (const auto& p: pvec) {
        P bagqitem = bagq.top();
        bagq.pop();
        bagindices[p.first] = bagqitem.first;
        bagqitem.second += p.second;
        bagq.push(bagqitem);
    }

    // load balance average/max (1.0 is perfect)
    // Drain *every* bag: skipping one would drop the largest bag (it is popped
    // last) and leave a spurious zero entry in the statistics.
    std::vector<size_t> v;
    v.reserve(bagq.size());
    while (!bagq.empty()) {
        v.push_back(bagq.top().second);
        bagq.pop();
    }
    double b = load_balance(v);
    if (bal) {
        *bal = b;
    } else {
        printf("load balance = %g for %zu pieces in %zu bags\n", b, pieces.size(), nbag);
    }

    return bagindices;
}

/**
 * Load balance of a set of bag sizes: (average size) / (largest size).
 *
 * 1.0 means perfectly balanced. Aborts if @p v is empty.
 */
double load_balance(std::vector<size_t>& v) {
    nrn_assert(!v.empty());
    // Seed with size_t: an `int` seed makes std::accumulate sum in int and
    // truncates large totals.
    const std::size_t sum = std::accumulate(v.begin(), v.end(), std::size_t{0});
    const std::size_t max = *std::max_element(v.begin(), v.end());
    if (max == 0) {
        // every bag is empty, i.e. all bags carry the same load; avoids 0/0
        return 1.0;
    }
    return (double(sum) / v.size()) / max;
}


================================================
FILE: coreneuron/utils/lpt.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <vector>

/**
 * Distribute pieces over bags, always placing the largest remaining piece into
 * the currently smallest bag.
 *
 * @param nbag   number of bags, must be > 0
 * @param pieces size of every piece, must not be empty
 * @param bal    if non-null receives the resulting load balance; if null the
 *               load balance is printed to stdout instead
 * @return bag index for every piece, parallel to @p pieces
 */
std::vector<std::size_t> lpt(std::size_t nbag,
                             std::vector<std::size_t>& pieces,
                             double* bal = nullptr);

/// Average of the sizes divided by the largest size; the vector must not be empty.
double load_balance(std::vector<size_t>&);


================================================
FILE: coreneuron/utils/memory.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/utils/memory.h"

#ifdef CORENEURON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif

#include <cassert>

namespace coreneuron {
/// True only in GPU builds where the runtime `gpu` parameter is set.
bool gpu_enabled() {
#ifndef CORENEURON_ENABLE_GPU
    // builds without GPU support can never run on a GPU
    return false;
#else
    return corenrn_param.gpu;
#endif
}

/**
 * Allocate @p num_bytes of memory.
 *
 * In GPU builds with the runtime gpu option set this is CUDA managed memory;
 * a failing CUDA allocation aborts. Otherwise the memory comes from
 * `::operator new`.
 */
void* allocate_unified(std::size_t num_bytes) {
#ifdef CORENEURON_ENABLE_GPU
    // The build supports GPU execution, check if --gpu was passed to actually
    // enable it. We should not call CUDA APIs in GPU builds if --gpu was not passed.
    if (corenrn_param.gpu) {
        // Allocate managed/unified memory.
        void* ptr{nullptr};
        auto const code = cudaMallocManaged(&ptr, num_bytes);
        // nrn_assert rather than assert: the check must survive NDEBUG builds,
        // otherwise a failed allocation silently returns nullptr.
        nrn_assert(code == cudaSuccess);
        return ptr;
    }
#endif
    // Either the build does not have GPU support or --gpu was not passed.
    // Allocate using standard operator new.
    // When we have C++17 support then propagate `alignment` here.
    return ::operator new(num_bytes);
}

/**
 * Release memory obtained from allocate_unified().
 *
 * @param ptr       pointer returned by allocate_unified
 * @param num_bytes size that was requested; only used for sized deallocation
 */
void deallocate_unified(void* ptr, std::size_t num_bytes) {
    // See comments in allocate_unified to understand the different branches.
#ifdef CORENEURON_ENABLE_GPU
    if (corenrn_param.gpu) {
        // Deallocate managed/unified memory.
        auto const code = cudaFree(ptr);
        // nrn_assert rather than assert so the failure is reported in NDEBUG builds too.
        nrn_assert(code == cudaSuccess);
        return;
    }
#endif
#ifdef __cpp_sized_deallocation
    ::operator delete(ptr, num_bytes);
#else
    // num_bytes is intentionally unused without sized deallocation support
    (void) num_bytes;
    ::operator delete(ptr);
#endif
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/memory.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>

#include "coreneuron/utils/nrn_assert.h"
#include "coreneuron/nrniv/nrniv_decl.h"

#if !defined(NRN_SOA_BYTE_ALIGN)
// for layout 0, every range variable array must be aligned by at least 16 bytes (the size of the
// simd memory bus)
#define NRN_SOA_BYTE_ALIGN (8 * sizeof(double))
#endif

namespace coreneuron {
/**
 * @brief Check if GPU support is enabled.
 *
 * This returns true if GPU support was enabled at compile time and at runtime
 * via coreneuron.gpu = True and/or --gpu, otherwise it returns false.
 */
bool gpu_enabled();

/** @brief Allocate unified memory in GPU builds iff GPU enabled, otherwise new
 */
void* allocate_unified(std::size_t num_bytes);

/** @brief Deallocate memory allocated by `allocate_unified`.
 */
void deallocate_unified(void* ptr, std::size_t num_bytes);

/** @brief Stateless C++ allocator that forwards to allocate_unified / deallocate_unified.
 */
template <typename T>
struct unified_allocator {
    using value_type = T;

    unified_allocator() = default;

    /// Converting constructor: the allocator has no state, nothing to copy.
    template <typename U>
    unified_allocator(unified_allocator<U> const&) noexcept {}

    /// Storage for @p n objects of type T (not constructed).
    value_type* allocate(std::size_t n) {
        void* const raw = allocate_unified(n * sizeof(value_type));
        return static_cast<value_type*>(raw);
    }

    /// Return storage for @p n objects previously obtained from allocate().
    void deallocate(value_type* p, std::size_t n) noexcept {
        std::size_t const num_bytes = n * sizeof(value_type);
        deallocate_unified(p, num_bytes);
    }
};

/// All unified_allocator instances are interchangeable, so they always compare equal.
template <typename T, typename U>
bool operator==(unified_allocator<T> const& /*lhs*/, unified_allocator<U> const& /*rhs*/) noexcept {
    return true;
}

/// Negation of operator==, i.e. never true.
template <typename T, typename U>
bool operator!=(unified_allocator<T> const& x, unified_allocator<U> const& y) noexcept {
    bool const equal = (x == y);
    return !equal;
}

/** @brief Deleter for std::unique_ptr that destroys and frees through an allocator.
 *
 *  Adapted from https://stackoverflow.com/a/23132307. Related material:
 *  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0316r0.html,
 *  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0211r3.html, and
 *  boost::allocate_unique<...>.
 *  Hopefully std::allocate_unique will be included in C++23.
 */
template <typename Alloc>
struct alloc_deleter {
    using pointer = typename std::allocator_traits<Alloc>::pointer;

    alloc_deleter() = default;  // OL210813 addition
    alloc_deleter(const Alloc& alloc)
        : a(alloc) {}

    /// Destroy the single object at @p p and give its storage back to the allocator.
    void operator()(pointer p) const {
        using traits = std::allocator_traits<Alloc>;
        // operator() is const, so work on a mutable copy of the stored allocator
        Alloc scratch(a);
        traits::destroy(scratch, std::addressof(*p));
        traits::deallocate(scratch, p, 1);
    }

  private:
    Alloc a;
};

template <typename T, typename Alloc, typename... Args>
auto allocate_unique(const Alloc& alloc, Args&&... args) {
    using AT = std::allocator_traits<Alloc>;
    static_assert(std::is_same<typename AT::value_type, std::remove_cv_t<T>>{}(),
                  "Allocator has the wrong value_type");

    Alloc a(alloc);
    auto p = AT::allocate(a, 1);
    try {
        AT::construct(a, std::addressof(*p), std::forward<Args>(args)...);
        using D = alloc_deleter<Alloc>;
        return std::unique_ptr<T, D>(p, D(a));
    } catch (...) {
        AT::deallocate(a, p, 1);
        throw;
    }
}
}  // namespace coreneuron

/// for gpu builds with unified memory support
#ifdef CORENEURON_UNIFIED_MEMORY

#include <cuda_runtime_api.h>

// TODO : error handling for CUDA routines
inline void alloc_memory(void*& pointer, size_t num_bytes, size_t /*alignment*/) {
    cudaMallocManaged(&pointer, num_bytes);
}

inline void calloc_memory(void*& pointer, size_t num_bytes, size_t /*alignment*/) {
    alloc_memory(pointer, num_bytes, 64);
    cudaMemset(pointer, 0, num_bytes);
}

/// Release memory obtained from alloc_memory / calloc_memory in this (CUDA) build flavour.
/// The status returned by cudaFree is not checked.
inline void free_memory(void* pointer) {
    cudaFree(pointer);
}

/**
 * A base class providing overloaded new and delete operators for CUDA allocation
 *
 * Classes that should be allocated on the GPU should inherit from this class. Additionally they
 * may need to implement a special copy-construtor. This is documented here:
 * \link: https://devblogs.nvidia.com/unified-memory-in-cuda-6/
 */
class MemoryManaged {
  public:
    void* operator new(size_t len) {
        void* ptr;
        cudaMallocManaged(&ptr, len);
        cudaDeviceSynchronize();
        return ptr;
    }

    void* operator new[](size_t len) {
        void* ptr;
        cudaMallocManaged(&ptr, len);
        cudaDeviceSynchronize();
        return ptr;
    }

    void operator delete(void* ptr) {
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }

    void operator delete[](void* ptr) {
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }
};


/// for cpu builds use posix memalign
#else
/// CPU build flavour of MemoryManaged: an empty class, so the default
/// new/delete operators are used.
class MemoryManaged {
    // does nothing by default
};

#include <cstdlib>

/**
 * Allocate @p num_bytes into @p pointer; aborts if the allocation fails.
 *
 * With a non-zero @p alignment the request is rounded up to the next multiple
 * of the alignment and served by std::aligned_alloc, otherwise plain
 * std::malloc is used.
 */
inline void alloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
    if (alignment == 0) {
        nrn_assert((pointer = std::malloc(num_bytes)) != nullptr);
        return;
    }

    // number of extra bytes needed to reach a multiple of the alignment
    const size_t remainder = num_bytes % alignment;
    const size_t fill = (remainder == 0) ? 0 : alignment - remainder;
    nrn_assert((pointer = std::aligned_alloc(alignment, num_bytes + fill)) != nullptr);
}

/// Allocate like alloc_memory(), then zero the first @p num_bytes bytes.
inline void calloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
    alloc_memory(pointer, num_bytes, alignment);
    std::memset(pointer, 0, num_bytes);
}

/// Release memory obtained from alloc_memory / calloc_memory (CPU build flavour).
inline void free_memory(void* pointer) {
    std::free(pointer);
}

#endif

namespace coreneuron {

/** Compute the padded element count of an array.
    The chunk template argument is the number of elements the count is padded
    to a multiple of. For the AoS layout no padding is applied.
*/
template <int chunk>
inline int soa_padded_size(int cnt, int layout) {
    if (layout == Layout::AoS) {
        return cnt;
    }
    const bool already_multiple = (cnt % chunk) == 0;
    if (already_multiple) {
        return cnt;
    }
    // round up to the next multiple of chunk
    const int full_chunks = cnt / chunk;
    return (full_chunks + 1) * chunk;
}

/** Tell whether the address held by @p pointer is a multiple of @p alignment.
 *  @p alignment must be non-zero.
 */
inline bool is_aligned(void* pointer, std::size_t alignment) {
    const auto address = reinterpret_cast<std::uintptr_t>(pointer);
    const auto misalignment = address % alignment;
    return misalignment == 0;
}

/**
 * Allocate aligned memory. This will be unified memory if the corresponding
 * CMake option is set. This must be freed with the free_memory method.
 *
 * The allocation is delegated to alloc_memory(); for a non-zero alignment the
 * alignment of the returned pointer is verified with nrn_assert.
 *
 * \param size      Size of buffer to allocate in bytes.
 * \param alignment Memory alignment, defaults to NRN_SOA_BYTE_ALIGN. Pass 0 for no alignment.
 * \return pointer to the (uninitialised) buffer
 */
inline void* emalloc_align(size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
    void* memptr;
    alloc_memory(memptr, size, alignment);
    if (alignment != 0) {
        nrn_assert(is_aligned(memptr, alignment));
    }
    return memptr;
}

/**
 * Allocate the aligned memory and set it to 0. This will be unified memory if
 * the corresponding CMake option is set. This must be freed with the
 * free_memory method.
 *
 * \param n         Number of objects to allocate
 * \param size      Size of buffer for each object to allocate in bytes.
 * \param alignment Memory alignment, defaults to NRN_SOA_BYTE_ALIGN. Pass 0 for no alignment.
 * \return pointer to the zeroed buffer, or nullptr when \p n is 0
 *
 * \note the allocated size will be \code n*size \endcode
 */
inline void* ecalloc_align(size_t n, size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
    void* p;
    // nothing to allocate: p is never touched on this path
    if (n == 0) {
        return nullptr;
    }
    calloc_memory(p, n * size, alignment);
    if (alignment != 0) {
        nrn_assert(is_aligned(p, alignment));
    }
    return p;
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/memory_utils.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/**
 * @file memory_utils.cpp
 * @date 25th Oct 2014
 *
 * @brief Provides functionality to report current memory usage
 * of the simulator using interface provided by malloc.h
 *
 * Memory utilisation report is based on the use of mallinfo
 * interface defined in malloc.h. For 64 bit platform, this
 * is not portable and hence it will be replaced with new
 * glibc implementation of malloc_info.
 *
 * @see http://man7.org/linux/man-pages/man3/malloc_info.3.html
 */

#include <stdio.h>
#include <fstream>
#include <unistd.h>
#include "coreneuron/utils/memory_utils.h"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

#if defined(__APPLE__) && defined(__MACH__)
#include <mach/mach.h>
#elif defined HAVE_MALLOC_H
#include <malloc.h>
#endif

#ifdef CORENEURON_ENABLE_GPU
#include "cuda_profiler_api.h"
#endif

namespace coreneuron {
/**
 * Report the memory usage of the current process in MiB.
 *
 * - macOS: resident size from task_info(); 0 if the query fails.
 * - MINGW: not supported, returns -1.
 * - otherwise: second value of /proc/self/statm multiplied by the page size
 *   (per proc(5) that field is the resident set size in pages); if the file
 *   cannot be opened and malloc.h is available, falls back to
 *   mallinfo()/mallinfo2() (hblkhd + uordblks).
 *
 * @return usage in MiB, or -1 when no source of information is available
 */
double nrn_mallinfo(void) {
    // -ve mem usage for non-supported platforms
    double mbs = -1.0;

// on os x returns the current resident set size (physical memory in use)
#if defined(__APPLE__) && defined(__MACH__)
    struct mach_task_basic_info info;
    mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
    if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t) &info, &infoCount) !=
        KERN_SUCCESS)
        return (size_t) 0L; /* Can't access? */
    return info.resident_size / (1024.0 * 1024.0);
#elif defined(MINGW)
    mbs = -1;
#else
    std::ifstream file("/proc/self/statm");
    if (file.is_open()) {
        unsigned long long int data_size;
        // two reads into the same variable: the first value is discarded, the second is kept
        file >> data_size >> data_size;
        file.close();
        // pages -> bytes -> MiB
        mbs = (data_size * sysconf(_SC_PAGESIZE)) / (1024.0 * 1024.0);
    } else {
#if defined HAVE_MALLOC_H
// The mallinfo2() function was added in glibc 2.33
#if defined(__GLIBC__) && (__GLIBC__ >= 2 && __GLIBC_MINOR__ >= 33)
        struct mallinfo2 m = mallinfo2();
#else
        struct mallinfo m = mallinfo();
#endif
        mbs = (m.hblkhd + m.uordblks) / (1024.0 * 1024.0);
#endif
    }
#endif
    return mbs;
}

/**
 * Print the memory usage (in MBs, as returned by nrn_mallinfo) to stdout.
 *
 * With MPI enabled the max/min/avg over all ranks are obtained with three
 * all-reduce calls, otherwise all three equal the local value. With
 * @p all_ranks every rank prints a line, otherwise only rank 0 prints; in GPU
 * runs rank 0 additionally prints the device memory statistics.
 *
 * @param message   label identifying the current stage of the simulation
 * @param all_ranks whether every rank prints its own line
 */
void report_mem_usage(const char* message, bool all_ranks) {
    double mem_max, mem_min, mem_avg;  // min, max, avg memory

    // current memory usage on this rank
    double cur_mem = nrn_mallinfo();

/* @todo: avoid three all reduce calls */
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        mem_avg = nrnmpi_dbl_allreduce(cur_mem, 1) / nrnmpi_numprocs;
        mem_max = nrnmpi_dbl_allreduce(cur_mem, 2);
        mem_min = nrnmpi_dbl_allreduce(cur_mem, 3);
    } else
#endif
    {
        mem_avg = mem_max = mem_min = cur_mem;
    }

    // all ranks prints information if all_ranks is true
    if (all_ranks) {
        printf(" Memory (MBs) (Rank : %2d) : %30s : Cur %.4lf, Max %.4lf, Min %.4lf, Avg %.4lf \n",
               nrnmpi_myid,
               message,
               cur_mem,
               mem_max,
               mem_min,
               mem_avg);
    } else if (nrnmpi_myid == 0) {
        printf(" Memory (MBs) : %25s : Max %.4lf, Min %.4lf, Avg %.4lf \n",
               message,
               mem_max,
               mem_min,
               mem_avg);
#ifdef CORENEURON_ENABLE_GPU
        if (corenrn_param.gpu) {
            size_t free_byte{0};
            size_t total_byte{0};
            cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
            if (cudaSuccess != cuda_status) {
                std::printf("cudaMemGetInfo failed: %s\n", cudaGetErrorString(cuda_status));
            } else {
                // only report the numbers when the query actually filled them in
                constexpr double MiB{1. / (1024. * 1024.)};
                std::printf(" GPU Memory (MiBs) : Used = %f, Free = %f, Total = %f\n",
                            (total_byte - free_byte) * MiB,
                            free_byte * MiB,
                            total_byte * MiB);
            }
        }
#endif
    }
    fflush(stdout);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/memory_utils.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/**
 * @file memory_utils.h
 * @date 25th Oct 2014
 * @brief Function prototypes for the functions providing
 * information about simulator memory usage
 *
 */

#pragma once

namespace coreneuron {
/** @brief Reports current memory usage of the simulator to stdout
 *
 *  The per-rank value comes from nrn_mallinfo(). This routine prints
 *  min, max and avg memory usage (in MBs) across mpi comm world
 *  @param message string indicating current stage of the simulation
 *  @param all_ranks indicate whether to print info from all ranks
 *  @return Void
 */
void report_mem_usage(const char* message, bool all_ranks = false);

/** @brief Returns current memory usage in MBs
 *
 *  The value is taken from /proc/self/statm where available, from the Mach
 *  task info on macOS, or from mallinfo as a fallback.
 *  @param Void
 *  @return memory usage in MBs, or -1 on platforms without support
 */
double nrn_mallinfo(void);
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/nrn_assert.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <cstdio>
#include <cstdlib>
#include <cstdarg>

/* Preserving original behaviour requires that we abort() on
 * parse failures.
 *
 * Relying on assert() (as in the original code) is fragile,
 * as this becomes a NOP if the source is compiled with
 * NDEBUG defined.
 */

/** Print a printf-style formatted message to stderr and terminate via abort(). */
static void abortf(const char* fmt, ...) {
    std::va_list args;
    va_start(args, fmt);
    std::vfprintf(stderr, fmt, args);
    va_end(args);
    std::abort();
}

/** assert()-like macro that stays active regardless of NDEBUG.
 *  The condition is evaluated exactly once; on failure the file, line and
 *  condition text are reported through abortf(). */
#define nrn_assert(x) \
    ((x) || (abortf("%s:%d: Assertion '%s' failed.\n", __FILE__, __LINE__, #x), 0))


================================================
FILE: coreneuron/utils/nrn_stats.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/**
 * @file nrn_stats.cpp
 * @date 25th Dec 2014
 * @brief Function declarations for the cell statistics
 *
 */

#include <algorithm>
#include <cstdio>
#include <climits>
#include <vector>
#include "coreneuron/utils/nrn_stats.h"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/network/partrans.hpp"
#include "coreneuron/io/output_spikes.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"
namespace coreneuron {
// Number of slots in the statistics array; slots 7, 8 and 9 are not used below.
const int NUM_STATS = 13;

/**
 * Collect per-thread counters, sum them over all ranks when MPI is enabled and
 * let rank 0 print the totals to stdout.
 *
 * Slot layout: 0 cells, 1 presyns, 2 input presyns, 3 netcons, 4 point
 * processes, 5 spikes, 6 spikes with gid > -1, 10 compartments,
 * 11 transfer targets, 12 transfer sources.
 */
void report_cell_stats() {
    long stat_array[NUM_STATS] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    for (int ith = 0; ith < nrn_nthread; ++ith) {
        stat_array[0] += nrn_threads[ith].ncell;           // number of cells
        stat_array[10] += nrn_threads[ith].end;            // number of compartments
        stat_array[1] += nrn_threads[ith].n_presyn;        // number of presyns
        stat_array[2] += nrn_threads[ith].n_input_presyn;  // number of input presyns
        stat_array[3] += nrn_threads[ith].n_netcon;        // number of netcons, synapses
        stat_array[4] += nrn_threads[ith].n_pntproc;       // number of point processes
        // transfer data is only counted when the per-thread transfer array exists
        if (nrn_partrans::transfer_thread_data_) {
            size_t n = nrn_partrans::transfer_thread_data_[ith].tar_indices.size();
            stat_array[11] += n;  // number of transfer targets
            n = nrn_partrans::transfer_thread_data_[ith].src_indices.size();
            stat_array[12] += n;  // number of transfer sources
        }
    }
    stat_array[5] = spikevec_gid.size();  // number of spikes

    stat_array[6] = std::count_if(spikevec_gid.cbegin(), spikevec_gid.cend(), [](const int& s) {
        return s > -1;
    });  // number of non-negative gid spikes

#if NRNMPI
    // global totals: reduced over all ranks, or a plain copy without MPI at runtime
    long gstat_array[NUM_STATS];
    if (corenrn_param.mpi_enable) {
        nrnmpi_long_allreduce_vec(stat_array, gstat_array, NUM_STATS, 1);
    } else {
        assert(sizeof(stat_array) == sizeof(gstat_array));
        std::memcpy(gstat_array, stat_array, sizeof(stat_array));
    }
#else
    // no MPI support compiled in: the local counters are the global ones
    const long(&gstat_array)[NUM_STATS] = stat_array;
#endif

    if (nrnmpi_myid == 0) {
        printf("\n\n Simulation Statistics\n");
        printf(" Number of cells: %ld\n", gstat_array[0]);
        printf(" Number of compartments: %ld\n", gstat_array[10]);
        printf(" Number of presyns: %ld\n", gstat_array[1]);
        printf(" Number of input presyns: %ld\n", gstat_array[2]);
        printf(" Number of synapses: %ld\n", gstat_array[3]);
        printf(" Number of point processes: %ld\n", gstat_array[4]);
        printf(" Number of transfer sources: %ld\n", gstat_array[12]);
        printf(" Number of transfer targets: %ld\n", gstat_array[11]);
        printf(" Number of spikes: %ld\n", gstat_array[5]);
        printf(" Number of spikes with non negative gid-s: %ld\n", gstat_array[6]);
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/nrn_stats.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

/**
 * @file nrn_stats.h
 * @date 25th Dec 2014
 * @brief Function declarations for the cell statistics
 *
 */

#pragma once
namespace coreneuron {
/** @brief Reports global cell statistics of the simulation
 *
 *  This routine prints the global number of cells, synapses of the simulation
 *  @param void
 *  @return void
 */
void report_cell_stats();

}  // namespace coreneuron


================================================
FILE: coreneuron/utils/nrnmutdec.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once

#if defined(_OPENMP)
#include <omp.h>

// Mutex built on an OpenMP lock; provides lock(), unlock() and try_lock()
// (the *Mutex* named requirement).
class OMP_Mutex {
  public:
    // The lock is initialised on construction ...
    OMP_Mutex() {
        omp_init_lock(&mut_);
    }

    // ... and destroyed together with the object
    ~OMP_Mutex() {
        omp_destroy_lock(&mut_);
    }

    // Neither copyable nor movable
    OMP_Mutex(const OMP_Mutex&) = delete;
    OMP_Mutex& operator=(const OMP_Mutex&) = delete;
    OMP_Mutex(const OMP_Mutex&&) = delete;
    OMP_Mutex& operator=(const OMP_Mutex&&) = delete;

    // Acquire the lock
    void lock() {
        omp_set_lock(&mut_);
    }

    // Release the lock
    void unlock() {
        omp_unset_lock(&mut_);
    }

    // Non-blocking attempt; true when the lock was acquired
    bool try_lock() {
        const int acquired = omp_test_lock(&mut_);
        return acquired != 0;
    }

  private:
    omp_lock_t mut_;
};

#else

// Fallback used without OpenMP: same interface as the OpenMP-backed mutex
// (the *Mutex* named requirement), but every operation is a no-op.
class OMP_Mutex {
  public:
    OMP_Mutex() = default;
    ~OMP_Mutex() = default;

    // Neither copyable nor movable
    OMP_Mutex(const OMP_Mutex&) = delete;
    OMP_Mutex& operator=(const OMP_Mutex&) = delete;
    OMP_Mutex(const OMP_Mutex&&) = delete;
    OMP_Mutex& operator=(const OMP_Mutex&&) = delete;

    // Nothing to acquire
    void lock() {}

    // Nothing to release
    void unlock() {}

    // Always reports success
    bool try_lock() {
        return true;
    }
};
#endif


================================================
FILE: coreneuron/utils/nrnoc_aux.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstdlib>
#include <cstring>

#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/coreneuron.hpp"
#include "coreneuron/utils/nrnoc_aux.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

namespace coreneuron {
// Global state shared with other translation units (zero-initialised).
bool stoprun;
int v_structure_change;
int diam_changed;
// hoc_Exp() stops printing range warnings once hoc_errno_count reaches this value
#define MAXERRCOUNT 5
// number of out-of-range calls seen by hoc_Exp()
int hoc_errno_count;
// dataset version accepted by check_bbcore_write_version()
const char* bbcore_write_version = "1.6";  // Allow multiple gid and PreSyn per real cell.

/// Return the `sym` entry of the mechanism registered under the type of @p pnt.
char* pnt_name(Point_process* pnt) {
    return corenrn.get_memb_func(pnt->_type).sym;
}

/// Terminate the process with exit code @p err; when MPI is enabled at runtime
/// nrnmpi_finalize() is called first.
void nrn_exit(int err) {
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        nrnmpi_finalize();
    }
#endif
    exit(err);
}

/**
 * Print "error: <s1> <s2>" to stdout and abort the process.
 *
 * @param s1 first part of the message
 * @param s2 optional second part, may be nullptr
 */
void hoc_execerror(const char* s1, const char* s2) {
    printf("error: %s %s\n", s1, s2 ? s2 : "");
    // abort() is not required to flush buffered streams; without this the
    // message can be lost when stdout is redirected to a file or pipe.
    fflush(stdout);
    abort();
}

/// Print "warning: <s1> <s2>" to stdout; @p s2 may be null and is then printed as empty.
void hoc_warning(const char* s1, const char* s2) {
    const char* detail = "";
    if (s2) {
        detail = s2;
    }
    printf("warning: %s %s\n", s1, detail);
}

/// Allocate a zero-initialised buffer of @p size bytes (nullptr when @p size is 0).
double* makevector(size_t size) {
    void* const raw = ecalloc(size, sizeof(char));
    return static_cast<double*>(raw);
}

/// Release a buffer obtained from makevector(); a null pointer is ignored.
void freevector(double* p) {
    if (p == nullptr) {
        return;
    }
    free(p);
}

/**
 * Allocate an nrows x ncols matrix of doubles.
 *
 * The elements live in one contiguous block and `matrix[i]` points at the
 * start of row i. The contents are not initialised. Release with freematrix().
 *
 * @return the row-pointer array, or nullptr when @p nrows is 0
 */
double** makematrix(size_t nrows, size_t ncols) {
    // Without rows there is no slot for the data pointer: writing *matrix
    // would store into a zero-sized allocation.
    if (nrows == 0) {
        return nullptr;
    }
    double** matrix = (double**) emalloc(nrows * sizeof(double*));
    *matrix = (double*) emalloc(nrows * ncols * sizeof(double));
    for (size_t i = 1; i < nrows; i++)
        matrix[i] = matrix[i - 1] + ncols;
    return (matrix);
}

/// Release a matrix: first the element block that row 0 points to, then the
/// row-pointer array itself. A null pointer is ignored.
void freematrix(double** matrix) {
    if (matrix == nullptr) {
        return;
    }
    double* const elements = *matrix;
    free(elements);
    free(matrix);
}

/**
 * malloc() wrapper that never returns a failed allocation: an out-of-memory
 * condition prints a message to stderr and aborts, also in NDEBUG builds.
 * A null result for a zero-sized request is passed through unchanged.
 */
void* emalloc(size_t size) {
    void* memptr = malloc(size);
    if (memptr == nullptr && size != 0) {
        fprintf(stderr, "emalloc: failed to allocate %zu bytes\n", size);
        abort();
    }
    return memptr;
}

/* some user mod files may use this in VERBATIM */
/// Forwards to emalloc().
void* hoc_Emalloc(size_t size) {
    return emalloc(size);
}
/// Intentionally empty.
void hoc_malchk(void) {}

/**
 * calloc() wrapper: returns nullptr when @p n is 0, otherwise zero-initialised
 * storage for @p n objects of @p size bytes. An out-of-memory condition prints
 * a message to stderr and aborts, also in NDEBUG builds.
 */
void* ecalloc(size_t n, size_t size) {
    if (n == 0) {
        return nullptr;
    }
    void* p = calloc(n, size);
    // a null result for zero-sized elements is not an error
    if (p == nullptr && size != 0) {
        fprintf(stderr, "ecalloc: failed to allocate %zu objects of %zu bytes\n", n, size);
        abort();
    }
    return p;
}

/**
 * realloc() wrapper: resizes @p ptr to @p size bytes, or allocates a fresh
 * buffer when @p ptr is null. An out-of-memory condition prints a message to
 * stderr and aborts, also in NDEBUG builds.
 */
void* erealloc(void* ptr, size_t size) {
    // realloc(nullptr, size) behaves like malloc(size), so no special case is needed
    void* p = realloc(ptr, size);
    if (p == nullptr && size != 0) {
        fprintf(stderr, "erealloc: failed to allocate %zu bytes\n", size);
        abort();
    }
    return p;
}

/// Allocate @p size bytes through alloc_memory() with an alignment argument of 64,
/// store the result in `*memptr` and also return it.
void* nrn_cacheline_alloc(void** memptr, size_t size) {
    alloc_memory(*memptr, size, 64);
    return *memptr;
}

/* used by nmodl and other c, c++ code */
/**
 * exp() with a clamped argument range: returns 0 below -700 and exp(700)
 * above 700. In the latter case errno is set to ERANGE and a warning is
 * printed to stderr for the first few occurrences only.
 */
double hoc_Exp(double x) {
    if (x < -700.) {
        return 0.;
    }
    if (x > 700) {
        errno = ERANGE;
        ++hoc_errno_count;
        if (hoc_errno_count < MAXERRCOUNT) {
            fprintf(stderr, "exp(%g) out of range, returning exp(700)\n", x);
        }
        if (hoc_errno_count == MAXERRCOUNT) {
            fprintf(stderr, "No more errno warnings during this execution\n");
        }
        return exp(700.);
    }
    return exp(x);
}

/* Compare the dataset version string against bbcore_write_version.
 * Returns normally when they are equal. On a mismatch the process is aborted;
 * the rank with nrnmpi_myid == 0 prints the diagnostic to stderr first.
 */
void check_bbcore_write_version(const char* version) {
    const bool versions_match = (strcmp(version, bbcore_write_version) == 0);
    if (versions_match) {
        return;
    }
    if (nrnmpi_myid == 0) {
        fprintf(stderr,
                "Error: Incompatible binary input dataset version (expected %s, input %s)\n",
                bbcore_write_version,
                version);
    }
    abort();
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/nrnoc_aux.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <cstddef>
#include "coreneuron/mechanism/mechanism.hpp"

namespace coreneuron {

/* Global integer flags shared across translation units. */
extern int v_structure_change;
extern int diam_changed;
extern int structure_change_cnt;

/* Returns the `sym` entry of the membrane function registered for pnt->_type. */
extern char* pnt_name(Point_process* pnt);

/* Calls exit() with the given code; in NRNMPI builds MPI is finalized first when enabled. */
extern void nrn_exit(int);

/* Checked wrappers around malloc/calloc/realloc. ecalloc returns nullptr when n == 0;
 * erealloc accepts a null ptr and then behaves like emalloc. */
extern void* emalloc(size_t size);
extern void* ecalloc(size_t n, size_t size);
extern void* erealloc(void* ptr, size_t size);

/* makevector returns zero-filled storage; makematrix returns nrow row pointers into one
 * contiguous block of nrow * ncol doubles. Release them with freevector / freematrix,
 * both of which ignore a null argument. */
extern double* makevector(size_t size); /* size in bytes */
extern double** makematrix(size_t nrow, size_t ncol);
void freevector(double*);
void freematrix(double**);

/* Both print "<kind>: <first> <second>" to stdout; a null second argument prints as "". */
extern void hoc_execerror(const char*, const char*); /* print and abort */
extern void hoc_warning(const char*, const char*);

/* exp() with the argument range guarded: 0 below -700, exp(700) (errno = ERANGE) above 700. */
extern double hoc_Exp(double x);
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/nrntimeout.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/utils/utils.hpp"

#if NRNMPI

#include <csignal>
#include <sys/time.h>

/* if you are using any sampling based profiling tool,
setitimer will conflict with profiler. In that case,
user can disable setitimer which is just safety for
deadlock situations */
namespace coreneuron {
#if (defined(DISABLE_TIMEOUT) || defined(MINGW))

/* Variant selected when DISABLE_TIMEOUT or MINGW is defined: the body is empty, so no
 * signal handler or timer is installed. */
void nrn_timeout(int seconds) {}

#else

/* Optional callback; timed_out() invokes it (when non-null) right before nrn_abort(). */
void (*nrntimeout_call)();
/* nrn_threads->_t as recorded when the timer was armed or when the last signal arrived. */
static double told;
/* Timer specification handed to setitimer() by nrn_timeout(). */
static struct itimerval value;
/* `act` is the SIGALRM action installed by nrn_timeout(); `oact` receives the previous
 * action and is re-installed when nrn_timeout(0) is called. */
static struct sigaction act, oact;

/* SIGALRM handler installed by nrn_timeout().
 *
 * Compares the current nrn_threads->_t with the value stored in `told`. If they are equal the
 * run is reported as timed out: a message is printed, nrntimeout_call is invoked when set, and
 * nrn_abort(0) is called. Otherwise `told` is refreshed for the next signal.
 *
 * NOTE(review): the handler calls printf(), which POSIX does not list as async-signal-safe.
 * NOTE(review): the CORENRN_DEBUG line references `t`, which is not declared in this block --
 * confirm it resolves when that macro is enabled.
 */
static void timed_out(int sig) {
    (void) sig; /* unused */
#if CORENRN_DEBUG
    printf("timed_out told=%g t=%g\n", told, t);
#endif
    if (nrn_threads->_t == told) { /* nothing has been accomplished since last signal*/
        printf("nrn_timeout t=%g\n", nrn_threads->_t);
        if (nrntimeout_call) {
            (*nrntimeout_call)();
        }
        nrn_abort(0);
    }
    /* progress was made: remember the new time for the next comparison */
    told = nrn_threads->_t;
}

/* Arm or disarm the periodic progress check.
 *
 * Only the rank with nrnmpi_myid == 0 does anything. A non-zero `seconds` records the current
 * nrn_threads->_t, installs timed_out() as the SIGALRM handler (saving the previous action in
 * `oact`) and starts an ITIMER_REAL timer that fires every `seconds` seconds. `seconds == 0`
 * restores the saved action and, because both timer fields are then zero, stops the timer.
 * A failing sigaction() (when arming) or setitimer() prints a message and calls nrn_abort(0).
 */
void nrn_timeout(int seconds) {
    if (nrnmpi_myid != 0) {
        return;
    }
#if CORENRN_DEBUG
    printf("nrn_timeout %d\n", seconds);
#endif
    if (seconds) {
        told = nrn_threads->_t;
        act.sa_handler = timed_out;
        /* restart interrupted system calls instead of failing them with EINTR */
        act.sa_flags = SA_RESTART;
        if (sigaction(SIGALRM, &act, &oact)) {
            printf("sigaction failed\n");
            nrn_abort(0);
        }
    } else {
        /* the return value of this restore is not checked */
        sigaction(SIGALRM, &oact, (struct sigaction*) 0);
    }
    /* same value for the first expiry and for the repeat interval */
    value.it_interval.tv_sec = seconds;
    value.it_interval.tv_usec = 0;
    value.it_value.tv_sec = seconds;
    value.it_value.tv_usec = 0;
    if (setitimer(ITIMER_REAL, &value, (struct itimerval*) 0)) {
        printf("setitimer failed\n");
        nrn_abort(0);
    }
}

#endif /* DISABLE_TIMEOUT */
}  // namespace coreneuron

#endif /*NRNMPI*/


================================================
FILE: coreneuron/utils/offload.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once
// Helper that turns its argument into a string literal so it can be passed to _Pragma.
#define nrn_pragma_stringify(x) #x
// Exactly one of nrn_pragma_acc / nrn_pragma_omp expands to a real `#pragma`; the other (or
// both, in the final branch) expands to nothing.
// Branch 1: GPU build that prefers OpenMP offload and is compiled with OpenMP enabled.
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#include <omp.h>
// Branch 2: GPU build that does not prefer OpenMP offload and is compiled with OpenACC enabled.
#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#include <openacc.h>
// Branch 3: every other configuration -- both macros are no-ops.
#else
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#endif

#include <cstddef>
#include <stdexcept>
#include <string_view>

namespace coreneuron {
// Debug hooks, defined outside this header. Each cnrn_target_* template below calls the hook
// of the same name with the call site (file, line), the element type information and the
// host/device pointers involved.
// NOTE(review): these signatures use std::type_info, but this header does not include
// <typeinfo> itself -- confirm it is always reached through another include.
void cnrn_target_copyin_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len,
                              void* d_ptr);
void cnrn_target_delete_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len);
void cnrn_target_deviceptr_debug(std::string_view file,
                                 int line,
                                 std::type_info const& typeid_T,
                                 void const* h_ptr,
                                 void* d_ptr);
void cnrn_target_is_present_debug(std::string_view file,
                                  int line,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr);
void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                        int line,
                                        std::size_t sizeof_T,
                                        std::type_info const& typeid_T,
                                        void const* h_ptr,
                                        std::size_t len,
                                        void* d_ptr);
// Enable the homegrown present table for NVHPC releases up to and including 22.3.
// The version test compares (major, minor) lexicographically: any major below 22 qualifies
// regardless of its minor number (e.g. 21.9, 21.11), and major 22 qualifies up to minor 3.
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY) && \
    defined(__NVCOMPILER_MAJOR__) && defined(__NVCOMPILER_MINOR__) &&        \
    ((__NVCOMPILER_MAJOR__ < 22) ||                                          \
     ((__NVCOMPILER_MAJOR__ == 22) && (__NVCOMPILER_MINOR__ <= 3)))
// Homegrown implementation for buggy NVHPC versions (<=22.3), see
// https://forums.developer.nvidia.com/t/acc-deviceptr-does-not-work-in-openacc-code-dynamically-loaded-from-a-shared-library/211599
#define CORENEURON_ENABLE_PRESENT_TABLE
// Looks up the device pointer for `h_ptr`; the bool is consumed by
// cnrn_target_deviceptr_or_present() as an error flag.
std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr);
// Called by cnrn_target_copyin() after the copy, with the size in bytes.
void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len);
// Called by cnrn_target_delete() before the device data is released, with the size in bytes.
void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len);
#endif

/// Look up the device pointer that corresponds to the host pointer `h_ptr`.
///
/// Shared implementation behind the cnrn_target_deviceptr (must_be_present_or_null == true) and
/// cnrn_target_is_present (must_be_present_or_null == false) macros.
///
/// @param file, line              call site, forwarded to the debug hook
/// @param must_be_present_or_null selects the strict lookup and which debug hook is called
/// @param h_ptr                   host pointer to translate
/// @return the device pointer found by the active backend; stays null in builds without
///         OpenACC/OpenMP GPU support
/// @throws std::runtime_error in non-GPU builds when a strict lookup is requested for a non-null
///         pointer, or when the present-table implementation reports an error
template <typename T>
T* cnrn_target_deviceptr_or_present(std::string_view file,
                                    int line,
                                    bool must_be_present_or_null,
                                    const T* h_ptr) {
    T* d_ptr{};
    bool error{false};
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
    // Homegrown lookup; this is the only branch that can set `error`.
    auto const d_ptr_and_error = cnrn_target_deviceptr_impl(must_be_present_or_null, h_ptr);
    d_ptr = static_cast<T*>(d_ptr_and_error.first);
    error = d_ptr_and_error.second;
#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    // OpenACC: `must_be_present_or_null` is not consulted in this branch.
    d_ptr = static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    // OpenMP: for the non-strict lookup the translation is skipped (d_ptr stays null) unless
    // the runtime reports the pointer as present on the default device.
    if (must_be_present_or_null || omp_target_is_present(h_ptr, omp_get_default_device())) {
        nrn_pragma_omp(target data use_device_ptr(h_ptr))
        { d_ptr = const_cast<T*>(h_ptr); }
    }
#else
    if (must_be_present_or_null && h_ptr) {
        throw std::runtime_error(
            "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
    }
#endif
    // Report the lookup before a possible error is raised, so the debug hook sees it too.
    if (must_be_present_or_null) {
        cnrn_target_deviceptr_debug(file, line, typeid(T), h_ptr, d_ptr);
    } else {
        cnrn_target_is_present_debug(file, line, typeid(T), h_ptr, d_ptr);
    }
    if (error) {
        throw std::runtime_error(
            "cnrn_target_deviceptr() encountered an error, you may want to try setting "
            "CORENEURON_GPU_DEBUG=1");
    }
    return d_ptr;
}

/// Copy `len` elements of type T starting at `h_ptr` to the device and return the device pointer.
///
/// @param file, line call site, forwarded to the debug hook
/// @param h_ptr      host pointer to the first element
/// @param len        number of elements (not bytes); defaults to 1
/// @return device pointer produced by the active backend
/// @throws std::runtime_error in builds without OpenACC/OpenMP GPU support
template <typename T>
T* cnrn_target_copyin(std::string_view file, int line, const T* h_ptr, std::size_t len = 1) {
    T* d_ptr{};
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    d_ptr = static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    // Map the range to the device, then read back the device address through use_device_ptr.
    nrn_pragma_omp(target enter data map(to : h_ptr[:len]))
    nrn_pragma_omp(target data use_device_ptr(h_ptr))
    { d_ptr = const_cast<T*>(h_ptr); }
#else
    throw std::runtime_error(
        "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
#endif
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
    // The present table is given the size in bytes.
    cnrn_target_copyin_update_present_table(h_ptr, d_ptr, len * sizeof(T));
#endif
    cnrn_target_copyin_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr);
    return d_ptr;
}

/// Release the device copy of `len` elements of type T that start at host pointer `h_ptr`.
///
/// The debug hook and (when enabled) the present-table update run before the backend call.
///
/// @param file, line call site, forwarded to the debug hook
/// @param h_ptr      host pointer to the first element
/// @param len        number of elements (not bytes); defaults to 1
/// @throws std::runtime_error in builds without OpenACC/OpenMP GPU support (after the debug
///         hook has already been called)
template <typename T>
void cnrn_target_delete(std::string_view file, int line, T* h_ptr, std::size_t len = 1) {
    cnrn_target_delete_debug(file, line, sizeof(T), typeid(T), h_ptr, len);
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
    // The present table is given the size in bytes.
    cnrn_target_delete_update_present_table(h_ptr, len * sizeof(T));
#endif
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_delete(h_ptr, len * sizeof(T));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    nrn_pragma_omp(target exit data map(delete : h_ptr[:len]))
#else
    throw std::runtime_error(
        "cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

/// Copy `len` elements of type T from host memory at `h_ptr` to device memory at `d_ptr`.
///
/// @param file, line call site, forwarded to the debug hook
/// @param d_ptr      destination device pointer
/// @param h_ptr      source host pointer
/// @param len        number of elements (not bytes); defaults to 1
/// @throws std::runtime_error in builds without OpenACC/OpenMP GPU support (after the debug
///         hook has already been called)
template <typename T>
void cnrn_target_memcpy_to_device(std::string_view file,
                                  int line,
                                  T* d_ptr,
                                  const T* h_ptr,
                                  std::size_t len = 1) {
    cnrn_target_memcpy_to_device_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr);
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    // Arguments: dst, src, byte count, dst offset, src offset, dst device, src device.
    // The return value of omp_target_memcpy is not checked.
    omp_target_memcpy(d_ptr,
                      const_cast<T*>(h_ptr),
                      len * sizeof(T),
                      0,
                      0,
                      omp_get_default_device(),
                      omp_get_initial_device());
#else
    throw std::runtime_error(
        "cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}

/// Refresh the device copy of `len` elements of type T from their host values.
///
/// Looks up the device pointer that corresponds to `h_ptr` (strict lookup) and copies
/// the host data over it.
///
/// @param file, line call site, forwarded to the lookup and copy helpers
/// @param h_ptr      host pointer to the first element; its device copy must already exist
/// @param len        number of elements (not bytes); defaults to 1
/// @throws std::runtime_error whenever the lookup or the copy helper throws
template <typename T>
void cnrn_target_update_on_device(std::string_view file,
                                  int line,
                                  const T* h_ptr,
                                  std::size_t len = 1) {
    auto* d_ptr = cnrn_target_deviceptr_or_present(file, line, true, h_ptr);
    // Forward `len`: without it the copy silently fell back to the helper's default of one
    // element, whatever the caller asked for.
    cnrn_target_memcpy_to_device(file, line, d_ptr, h_ptr, len);
}

// Function-like macros that shadow the templates above and inject the call site
// (__FILE__, __LINE__) as the first two arguments. They only take effect for code that follows
// this point, so the template definitions above still call each other as plain functions.
// cnrn_target_is_present and cnrn_target_deviceptr both map onto
// cnrn_target_deviceptr_or_present, with must_be_present_or_null = false / true respectively.
// Replace with std::source_location once we have C++20
#define cnrn_target_copyin(...) cnrn_target_copyin(__FILE__, __LINE__, __VA_ARGS__)
#define cnrn_target_delete(...) cnrn_target_delete(__FILE__, __LINE__, __VA_ARGS__)
#define cnrn_target_is_present(...) \
    cnrn_target_deviceptr_or_present(__FILE__, __LINE__, false, __VA_ARGS__)
#define cnrn_target_deviceptr(...) \
    cnrn_target_deviceptr_or_present(__FILE__, __LINE__, true, __VA_ARGS__)
#define cnrn_target_memcpy_to_device(...) \
    cnrn_target_memcpy_to_device(__FILE__, __LINE__, __VA_ARGS__)
#define cnrn_target_update_on_device(...) \
    cnrn_target_update_on_device(__FILE__, __LINE__, __VA_ARGS__)

}  // namespace coreneuron


================================================
FILE: coreneuron/utils/profile/profiler_interface.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#pragma once

#include <initializer_list>
#include <type_traits>

#if defined(CORENEURON_CALIPER)
#include <caliper/cali.h>
#endif

#ifdef CORENEURON_CUDA_PROFILING
#include <cuda_profiler_api.h>
#endif

#if defined(CRAYPAT)
#include <pat_api.h>
#endif

#if defined(TAU)
#include <TAU.h>
#endif

#if defined(LIKWID_PERFMON)
#include <likwid.h>
#endif

namespace coreneuron {

namespace detail {

/*! \class Instrumentor
 *  \brief Fan-out front end over a list of profiler back ends.
 *
 *  Each static member forwards its call, in template-argument order, to the
 *  member of the same name on every type in `TProfilerImpl`. Any value returned
 *  by a back end is discarded.
 */
template <class... TProfilerImpl>
struct Instrumentor {
    /*! \fn phase_begin
     *  \brief Mark the start of a named region on every back end.
     *
     *  Back ends that cannot track several named regions at the same time are
     *  expected to implement this as a no-op.
     *
     *  @param name the (unique) identifier of the code region to be profiled
     */
    inline static void phase_begin(const char* name) {
        (static_cast<void>(TProfilerImpl::phase_begin(name)), ...);
    }

    /*! \fn phase_end
     *  \brief Mark the end of a named region on every back end.
     *
     *  Counterpart of `phase_begin`; the same remark about back ends without
     *  named regions applies.
     *
     *  @param name the (unique) identifier of the code region to be profiled
     */
    inline static void phase_end(const char* name) {
        (static_cast<void>(TProfilerImpl::phase_end(name)), ...);
    }

    /*! \fn start_profile
     *  \brief Switch data collection on globally, without naming a region.
     *
     *  Meant for back ends that only offer a global on/off interface; back ends
     *  with named regions are expected to implement this as a no-op.
     */
    inline static void start_profile() {
        (static_cast<void>(TProfilerImpl::start_profile()), ...);
    }

    /*! \fn stop_profile
     *  \brief Switch data collection off globally, without naming a region.
     *
     *  Counterpart of `start_profile`.
     */
    inline static void stop_profile() {
        (static_cast<void>(TProfilerImpl::stop_profile()), ...);
    }

    /*! \fn init_profile
     *  \brief Let every back end set up its internal state.
     *
     *  No data collection is started here; the role is comparable to MPI_Init.
     *  Only back ends that need explicit initialization do any work.
     */
    inline static void init_profile() {
        (static_cast<void>(TProfilerImpl::init_profile()), ...);
    }

    /*! \fn finalize_profile
     *  \brief Let every back end tear down its internal state.
     *
     *  The role is comparable to MPI_Finalize. Only back ends that need explicit
     *  finalization do any work.
     */
    inline static void finalize_profile() {
        (static_cast<void>(TProfilerImpl::finalize_profile()), ...);
    }
};

#if defined(CORENEURON_CALIPER)

/// Back end for Caliper: named regions are forwarded to the CALI_MARK_BEGIN /
/// CALI_MARK_END macros, every other hook is a no-op.
struct Caliper {
    inline static void phase_begin(const char* name) {
        CALI_MARK_BEGIN(name);
    }

    inline static void phase_end(const char* name) {
        CALI_MARK_END(name);
    }

    inline static void start_profile() {}

    inline static void stop_profile() {}

    inline static void init_profile() {}

    inline static void finalize_profile() {}
};

#endif

#ifdef CORENEURON_CUDA_PROFILING

/// Back end for the CUDA profiler API: only the global start/stop hooks do work
/// (cudaProfilerStart / cudaProfilerStop); named regions and init/finalize are no-ops.
struct CudaProfiling {
    inline static void phase_begin(const char*) {}

    inline static void phase_end(const char*) {}

    inline static void start_profile() {
        cudaProfilerStart();
    }

    inline static void stop_profile() {
        cudaProfilerStop();
    }

    inline static void init_profile() {}

    inline static void finalize_profile() {}
};

#endif

#if defined(CRAYPAT)

/// Back end for CrayPat: only the global start/stop hooks do work, by calling
/// PAT_record with PAT_STATE_ON / PAT_STATE_OFF; every other hook is a no-op.
struct CrayPat {
    inline static void phase_begin(const char*) {}

    inline static void phase_end(const char*) {}

    inline static void start_profile() {
        PAT_record(PAT_STATE_ON);
    }

    inline static void stop_profile() {
        PAT_record(PAT_STATE_OFF);
    }

    inline static void init_profile() {}

    inline static void finalize_profile() {}
};
#endif

#if defined(TAU)

/// Back end for TAU: only the global start/stop hooks do work, through
/// TAU_ENABLE_INSTRUMENTATION / TAU_DISABLE_INSTRUMENTATION; every other hook is a no-op.
struct Tau {
    inline static void phase_begin(const char*) {}

    inline static void phase_end(const char*) {}

    inline static void start_profile() {
        TAU_ENABLE_INSTRUMENTATION();
    }

    inline static void stop_profile() {
        TAU_DISABLE_INSTRUMENTATION();
    }

    inline static void init_profile() {}

    inline static void finalize_profile() {}
};

#endif

#if defined(LIKWID_PERFMON)

/// Back end for the LIKWID marker API: named regions map onto
/// LIKWID_MARKER_START / LIKWID_MARKER_STOP, init/finalize onto the marker
/// init/close macros. The global start/stop hooks are no-ops.
struct Likwid {
    inline static void phase_begin(const char* name) {
        LIKWID_MARKER_START(name);
    }

    inline static void phase_end(const char* name) {
        LIKWID_MARKER_STOP(name);
    }

    inline static void start_profile() {}

    inline static void stop_profile() {}

    inline static void init_profile() {
        LIKWID_MARKER_INIT;

        // Per-thread marker initialization, executed inside an OpenMP parallel region.
#pragma omp parallel
        { LIKWID_MARKER_THREADINIT; }
    }

    inline static void finalize_profile() {
        LIKWID_MARKER_CLOSE;
    }
};

#endif

/// Back end that does nothing: every hook has an empty body. It is always the last
/// entry of the InstrumentorImpl back-end list.
struct NullInstrumentor {
    inline static void phase_begin(const char*) {}
    inline static void phase_end(const char*) {}
    inline static void start_profile() {}
    inline static void stop_profile() {}
    inline static void init_profile() {}
    inline static void finalize_profile() {}
};

// Concrete instrumentor: the back-end list is assembled at preprocessing time from whichever
// profiler macros are defined. NullInstrumentor is always listed last, which also keeps the
// template argument list well-formed after the trailing comma of any enabled back end.
using InstrumentorImpl = detail::Instrumentor<
#if defined CORENEURON_CALIPER
    detail::Caliper,
#endif
#ifdef CORENEURON_CUDA_PROFILING
    detail::CudaProfiling,
#endif
#if defined(CRAYPAT)
    detail::CrayPat,
#endif
#if defined(TAU)
    detail::Tau,
#endif
#if defined(LIKWID_PERFMON)
    detail::Likwid,
#endif
    detail::NullInstrumentor>;
}  // namespace detail

/// Public profiling interface: thin wrappers over detail::InstrumentorImpl plus a
/// scope guard for named regions.
namespace Instrumentor {
/// Initialize all enabled profiler back ends.
inline static void init_profile() {
    detail::InstrumentorImpl::init_profile();
}

/// Finalize all enabled profiler back ends.
inline static void finalize_profile() {
    detail::InstrumentorImpl::finalize_profile();
}

/// Globally switch data collection on.
inline static void start_profile() {
    detail::InstrumentorImpl::start_profile();
}

/// Globally switch data collection off.
inline static void stop_profile() {
    detail::InstrumentorImpl::stop_profile();
}

/// Open the region called `name`.
inline static void phase_begin(const char* name) {
    detail::InstrumentorImpl::phase_begin(name);
}

/// Close the region called `name`.
inline static void phase_end(const char* name) {
    detail::InstrumentorImpl::phase_end(name);
}

/// Scope guard: the constructor opens the region `name`, the destructor closes it.
/// Only the pointer is stored, so the string must outlive the guard.
struct phase {
    const char* phase_name;
    phase(const char* name)
        : phase_name{name} {
        detail::InstrumentorImpl::phase_begin(phase_name);
    }
    ~phase() {
        detail::InstrumentorImpl::phase_end(phase_name);
    }
};
}  // namespace Instrumentor

}  // namespace coreneuron


================================================
FILE: coreneuron/utils/progressbar/progressbar.cpp
================================================
/**
 * \file
 * \author Trevor Fountain
 * \author Johannes Buchner
 * \author Erik Garrison
 * \date 2010-2014
 * \copyright BSD 3-Clause
 *
 * progressbar -- a C class (by convention) for displaying progress
 * on the command line (to stdout).
 */
#include "coreneuron/utils/progressbar/progressbar.hpp"

#include <cassert>
#include <cstddef>
#include <climits>
#include <unistd.h>

/// Screen width assumed by get_screen_width(), which always returns this value.
enum { DEFAULT_SCREEN_WIDTH = 80 };

/// The smallest that the bar can ever be (not including borders)
enum { MINIMUM_BAR_WIDTH = 10 };

/// The format in which the simulation time and the estimated remaining time will be reported
static const char* const ETA_FORMAT = "t: %-6.2f ETA:%2dh%02dm%02ds";

/// Number of characters ETA_FORMAT yields when every field fits its minimum width:
/// "t: " (3) + "%-6.2f" (6) + " ETA:" (5) + "%2dh%02dm%02ds" (9) = 23.
/// The bar and label widths are derived from this value, so it has to match the format:
/// with the former value of 13 each drawn line came out 10 columns wider than the screen
/// width the layout was computed for.
enum { ETA_FORMAT_LENGTH = 23 };

/// Amount of screen width taken up by whitespace (i.e. whitespace between label/bar/ETA components)
enum { WHITESPACE_LENGTH = 2 };

/// The amount of width taken up by the border of the bar component.
enum { BAR_BORDER_WIDTH = 2 };

/// The maximum number of bar redraws (to avoid frequent output in long runs)
enum { BAR_DRAW_COUNT_MAX = 500 };

/// Minimum redraw interval in seconds when stdout is a terminal (BAR_DRAW_INTERVAL) and when it
/// is not (BAR_DRAW_INTERVAL_NOTTY).
enum { BAR_DRAW_INTERVAL = 1, BAR_DRAW_INTERVAL_NOTTY = 5 };

/// A duration split into hours, minutes and seconds. For a normalized value the
/// seconds are below 60 and the minutes are below 60.
struct progressbar_time_components {
    /// whole hours
    int hours;
    /// whole minutes left after removing the hours
    int minutes;
    /// seconds left after removing hours and minutes
    int seconds;
};

// Forward declarations: both helpers are defined further down in this file but are already
// used by the public functions above their definitions.
static void progressbar_draw(const progressbar* bar);
static int progressbar_remaining_seconds(const progressbar* bar);

/**
 * Create a new progress bar with the specified label, max number of steps, and format string,
 * and draw it once.
 *
 * `format` must be exactly three characters long, e.g. "<->" to render a progress bar like
 * "<---------->"; this is checked with an assertion. Only the `label` pointer is stored, so
 * the string has to outlive the bar.
 *
 * Every field of the new object is given a defined value before the first draw.
 *
 * Returns nullptr if there isn't enough memory to allocate a progressbar.
 */
progressbar* progressbar_new_with_format(const char* label, unsigned long max, const char* format) {
    assert(3 == strlen(format) && "format must be 3 characters in length");

    auto* new_bar = static_cast<progressbar*>(malloc(sizeof(progressbar)));
    if (new_bar == nullptr) {
        return nullptr;
    }

    new_bar->max = max;
    new_bar->value = 0;
    // Not used by the functions in this file, but must not be left indeterminate.
    new_bar->prev_sample_value = 0;
    new_bar->draw_time_interval = isatty(STDOUT_FILENO) ? BAR_DRAW_INTERVAL
                                                        : BAR_DRAW_INTERVAL_NOTTY;
    new_bar->drawn_count = 0;
    new_bar->start = time(nullptr);
    new_bar->prev_t = 0;
    new_bar->t = 0;
    new_bar->format.begin = format[0];
    new_bar->format.fill = format[1];
    new_bar->format.end = format[2];

    progressbar_update_label(new_bar, label);
    progressbar_draw(new_bar);

    // Record the first draw: when it happened (seconds since `start`) and that it happened.
    new_bar->prev_t = difftime(time(nullptr), new_bar->start);
    new_bar->drawn_count = 1;

    return new_bar;
}

/**
 * Create a new progress bar with the specified label and max number of steps,
 * rendered with the default "|=|" format.
 */
progressbar* progressbar_new(const char* label, unsigned long max) {
    const char* const default_format = "|=|";
    return progressbar_new_with_format(label, max, default_format);
}

/**
 * Replace the label of the progressbar. Nothing is redrawn here. Only the pointer is stored
 * (the string is not copied), so `label` must remain valid for as long as the bar uses it.
 */
void progressbar_update_label(progressbar* bar, const char* label) {
    bar->label = label;
}

/**
 * Delete an existing progress bar by passing it to free(). The label string is not released,
 * since the bar only holds a pointer to it.
 */
void progressbar_free(progressbar* bar) {
    free(bar);
}

/**
 * Set the progressbar to `value` steps at simulation time `t`.
 * A redraw is issued only when at least `draw_time_interval` seconds have passed since the
 * previous one (min: 1sec). The interval is re-estimated after a redraw and grows because:
 *  - Stdout is not TTY
 *  - BAR_DRAW_COUNT_MAX has to be respected
 */
void progressbar_update(progressbar* bar, unsigned long value, double t) {
    bar->value = value;
    bar->t = t;

    // Wall-clock seconds since the bar was created.
    int elapsed = difftime(time(nullptr), bar->start);

    // Too early for the next redraw: only the bookkeeping above is updated.
    if ((elapsed - bar->prev_t) < bar->draw_time_interval) {
        return;
    }

    progressbar_draw(bar);
    bar->drawn_count++;
    bar->prev_t = elapsed;

    // Keep the current interval once the redraw budget is used up (this also avoids a division
    // by zero below), and during the first 15 seconds, when the ETA is still too unreliable and
    // could push the next redraw far into the future.
    if (bar->drawn_count >= BAR_DRAW_COUNT_MAX || elapsed < 15) {
        return;
    }

    // Spread the remaining redraws evenly over the estimated remaining time.
    int eta_s = progressbar_remaining_seconds(bar);
    bar->draw_time_interval = eta_s / (BAR_DRAW_COUNT_MAX - bar->drawn_count);

    // Enforce the lower bound that applies to the kind of output we are writing to.
    if (bar->draw_time_interval < BAR_DRAW_INTERVAL_NOTTY) {
        if (!isatty(STDOUT_FILENO)) {
            bar->draw_time_interval = BAR_DRAW_INTERVAL_NOTTY;
        } else if (bar->draw_time_interval < BAR_DRAW_INTERVAL) {
            bar->draw_time_interval = BAR_DRAW_INTERVAL;
        }
    }
}

/**
 * Advance an existing progressbar by a single step, recording `t` as the
 * current simulation time.
 */
void progressbar_inc(progressbar* bar, double t) {
    const unsigned long next_value = bar->value + 1;
    progressbar_update(bar, next_value, t);
}

/// Write the character `ch` to `file` exactly `times` times.
static void progressbar_write_char(FILE* file, const int ch, const size_t times) {
    size_t remaining = times;
    while (remaining > 0) {
        fputc(ch, file);
        --remaining;
    }
}

/// Return the larger of the two integers.
static int progressbar_max(int x, int y) {
    if (y < x) {
        return x;
    }
    return y;
}

/// Width of the output line in columns. The terminal is not queried: the
/// result is always DEFAULT_SCREEN_WIDTH.
static unsigned int get_screen_width(void) {
    const unsigned int width = DEFAULT_SCREEN_WIDTH;
    return width;
}

/// Width available for the bar component: what is left of the screen after the label,
/// the ETA text and the separating whitespace, but never less than MINIMUM_BAR_WIDTH.
static int progressbar_bar_width(int screen_width, int label_length) {
    const int leftover = screen_width - label_length - ETA_FORMAT_LENGTH - WHITESPACE_LENGTH;
    return progressbar_max(MINIMUM_BAR_WIDTH, leftover);
}

/// Number of label characters that can be shown. The full label is kept when label,
/// bar and ETA (plus one separator each) fit on the screen; otherwise the label is cut
/// down to whatever space remains, possibly to zero.
static int progressbar_label_width(int screen_width, int label_length, int bar_width) {
    const int full_line = label_length + 1 + bar_width + 1 + ETA_FORMAT_LENGTH;
    if (full_line <= screen_width) {
        return label_length;
    }
    // The progressbar is too wide for the screen, so the label is what gets sacrificed.
    const int space_for_label = screen_width - bar_width - ETA_FORMAT_LENGTH - WHITESPACE_LENGTH;
    return progressbar_max(0, space_for_label);
}

/// Estimate the remaining wall-clock seconds by extrapolating the average time per
/// completed step over the steps still outstanding.
///
/// @return 0 when no step has been completed yet, when no time has elapsed, or when the bar
///         has reached (or gone past) `max`; otherwise the estimate, truncated to whole
///         seconds and capped at INT_MAX.
static int progressbar_remaining_seconds(const progressbar* bar) {
    // `max - value` is unsigned: without this guard a value beyond max would wrap around to a
    // huge step count and the result would not fit into an int.
    if (bar->value == 0 || bar->value >= bar->max) {
        return 0;
    }
    const double elapsed = difftime(time(nullptr), bar->start);
    if (!(elapsed > 0)) {
        return 0;
    }
    const double seconds_per_step = elapsed / static_cast<double>(bar->value);
    const double remaining = seconds_per_step * static_cast<double>(bar->max - bar->value);
    if (remaining >= static_cast<double>(INT_MAX)) {
        return INT_MAX;
    }
    return static_cast<int>(remaining);
}

/// Split a number of seconds into hours, minutes and seconds.
static progressbar_time_components progressbar_calc_time_components(int seconds) {
    const int whole_hours = seconds / 3600;
    const int whole_minutes = (seconds % 3600) / 60;
    const int leftover_seconds = seconds % 60;

    progressbar_time_components components = {whole_hours, whole_minutes, leftover_seconds};
    return components;
}

static void progressbar_draw(const progressbar* bar) {
    int screen_width = get_screen_width();
    int label_length = strlen(bar->label);
    int bar_width = progressbar_bar_width(screen_width, label_length);
    int label_width = progressbar_label_width(screen_width, label_length, bar_width);

    int progressbar_completed = (bar->value >= bar->max);
    int bar_piece_count = bar_width - BAR_BORDER_WIDTH;
    int bar_piece_current = (progressbar_completed)
                                ? bar_piece_count
                                : bar_piece_count * ((double) bar->value / bar->max);

    progressbar_time_components eta =
        (progressbar_completed)
            ? progressbar_calc_time_components(difftime(time(nullptr), bar->start))
            : progressbar_calc_time_components(progressbar_remaining_seconds(bar));

    if (label_width == 0) {
        // The label would usually have a trailing space, but in the case that we don't print
        // a label, the bar can use that space instead.
        bar_width += 1;
    } else {
        // Draw the label
        fwrite(bar->label, 1, label_width, stdout);
        fputc(' ', stdout);
    }

    // Draw the progressbar
    fputc(bar->format.begin, stdout);
    progressbar_write_char(stdout, bar->format.fill, bar_piece_current);
    progressbar_write_char(stdout, ' ', bar_piece_count - bar_piece_current);
    fputc(bar->format.end, stdout);

    // Draw the ETA
    fputc(' ', stdout);
    fprintf(stdout, ETA_FORMAT, bar->t, eta.hours, eta.minutes, eta.seconds);
    fputc('\r', stdout);
    fflush(stdout);
}

/**
 * Finish a progressbar: draw it one last time, end the output line, and free it.
 * The bar's value is not modified here, so it is rendered as complete only if the value has
 * already reached `max` through progressbar_update/progressbar_inc.
 */
void progressbar_finish(progressbar* bar) {
    // Final redraw with the last recorded value and time.
    progressbar_draw(bar);

    // Print a newline, so that future outputs to stdout look prettier
    fprintf(stdout, "\n");

    // We've finished with this progressbar, so go ahead and free it.
    progressbar_free(bar);
}


================================================
FILE: coreneuron/utils/progressbar/progressbar.hpp
================================================
/**
 * \file
 * \author Trevor Fountain
 * \author Johannes Buchner
 * \author Erik Garrison
 * \date 2010-2014
 * \copyright BSD 3-Clause
 *
 * progressbar -- a C class (by convention) for displaying progress
 * on the command line (to stdout).
 */
#pragma once
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/**
 * Progressbar data structure (do not modify or create directly).
 * Instances are allocated by progressbar_new / progressbar_new_with_format and released by
 * progressbar_finish (or progressbar_free).
 */
struct progressbar {
    /// maximum value
    unsigned long max;

    /// current value
    unsigned long value;

    /// value of the previous progress bar drawn in output
    /// NOTE(review): no function in progressbar.cpp reads this field -- confirm it is still needed
    unsigned long prev_sample_value;

    /// time interval between consecutive bar redraws (seconds)
    time_t draw_time_interval;

    /// number of redrawn bars
    unsigned long drawn_count;

    /// time progressbar was started
    time_t start;

    /// when the progressbar was last drawn, in seconds elapsed since `start`
    time_t prev_t;

    /// label (pointer only; the string is owned by the caller)
    const char* label;

    /// current time (added for simulation)
    double t;

    /// characters for the beginning, filling and end of the
    /// progressbar. E.g. |###    | has |#|
    struct {
        char begin;
        char fill;
        char end;
    } format;
};

/// Create a new progressbar with the specified label and number of steps.
///
/// @param label The label that will prefix the progressbar.
/// @param max The number of times the progressbar must be incremented before it is considered
/// complete, or, in other words, the number of tasks that this progressbar is tracking.
/// @return A progressbar configured with the provided arguments. Note that the user is responsible
/// for disposing of the progressbar via progressbar_finish when finished with the object.
progressbar* progressbar_new(const char* label, unsigned long max);

/// Create a new progressbar with the specified label, number of steps, and format string.
///
/// @param label The label that will prefix the progressbar.
/// @param max The number of times the progressbar must be incremented before it is considered
/// complete, or, in other words, the number of tasks that this progressbar is tracking.
/// @param format The format of the progressbar. The string provided must be three characters, and
/// it will be interpreted with the first character as the left border of the bar, the second
/// character as the fill of the bar and the third character as the right border of the bar. For
/// example, "<->" would result in a bar formatted like "<------     >".
///
/// @return A progressbar configured with the provided arguments. Note that the user is responsible
/// for disposing of the progressbar via progressbar_finish when finished with the object.
progressbar* progressbar_new_with_format(const char* label, unsigned long max, const char* format);

/// Free an existing progress bar. Don't call this directly; call *progressbar_finish* instead.
void progressbar_free(progressbar* bar);

/// Increment the given progressbar. Don't increment past the initialized # of steps, though.
///
/// @param bar The progressbar to advance.
/// @param t Time value to associate with the bar -- presumably the simulation time that is
/// printed next to the ETA (see progressbar::t); confirm against progressbar.cpp.
void progressbar_inc(progressbar* bar, double t);

/// Set the current status on the given progressbar.
///
/// @param bar The progressbar to update.
/// @param value The new current value of the bar.
/// @param t Time value to associate with the bar -- presumably the simulation time that is
/// printed next to the ETA (see progressbar::t); confirm against progressbar.cpp.
void progressbar_update(progressbar* bar, unsigned long value, double t);

/// Set the label of the progressbar. Note that no rendering is done. The label is simply set so
/// that the next rendering will use the new label. To immediately see the new label, call
/// progressbar_draw.
/// Does not update display or copy the label, so the string must stay valid while the bar uses it.
void progressbar_update_label(progressbar* bar, const char* label);

/// Finalize (and free!) a progressbar. Call this when you're done, or if you break out
/// partway through. The bar pointer is invalid afterwards.
void progressbar_finish(progressbar* bar);


================================================
FILE: coreneuron/utils/randoms/nrnran123.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/mpi/core/nrnmpi.hpp"
#include "coreneuron/utils/memory.h"
#include "coreneuron/utils/nrnmutdec.hpp"
#include "coreneuron/utils/randoms/nrnran123.h"

#ifdef CORENEURON_USE_BOOST_POOL
#include <boost/pool/pool_alloc.hpp>
#include <unordered_map>
#endif

#include <cassert>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <memory>
#include <mutex>
#include <stdexcept>

// Defining these attributes seems to help nvc++ in OpenMP target offload mode.
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP) && defined(__CUDACC__)
#define CORENRN_HOST_DEVICE __host__ __device__
#else
#define CORENRN_HOST_DEVICE
#endif

namespace {
#ifdef CORENEURON_USE_BOOST_POOL
/** Tag type for use with boost::fast_pool_allocator that forwards to
 *  coreneuron::[de]allocate_unified(). Using a Random123-specific type here
 *  makes sure that allocations do not come from the same global pool as other
 *  usage of boost pools for objects with sizeof == sizeof(nrnran123_State).
 *
 *  The messy m_block_sizes map is just because `deallocate_unified` uses sized
 *  deallocations, but the Boost pool allocators don't. Because this is hidden
 *  behind the pool mechanism, these methods are not called very often and the
 *  overhead is minimal.
 *
 *  Both entry points serialise on m_mutex because they share m_block_sizes.
 */
struct random123_allocate_unified {
    using size_type = std::size_t;
    // Boost's UserAllocator concept asks for a *signed* type able to hold the
    // difference of two pointers, so this must not be std::size_t.
    using difference_type = std::ptrdiff_t;

    /** Allocate `bytes` bytes via allocate_unified and remember the size of
     *  the block so that free() can perform a sized deallocation.
     */
    static char* malloc(const size_type bytes) {
        std::lock_guard<std::mutex> const lock{m_mutex};
        static_cast<void>(lock);
        auto* buffer = coreneuron::allocate_unified(bytes);
        m_block_sizes[buffer] = bytes;
        return reinterpret_cast<char*>(buffer);
    }

    /** Release a block previously returned by malloc(). The block must be
     *  known to m_block_sizes; this is only checked by an assert.
     */
    static void free(char* const block) {
        std::lock_guard<std::mutex> const lock{m_mutex};
        static_cast<void>(lock);
        auto const iter = m_block_sizes.find(block);
        assert(iter != m_block_sizes.end());
        // Fetch the size before erasing: erase invalidates the iterator.
        auto const size = iter->second;
        m_block_sizes.erase(iter);
        return coreneuron::deallocate_unified(block, size);
    }
    /// Protects m_block_sizes.
    static std::mutex m_mutex;
    /// Size in bytes of every live block handed out by malloc().
    static std::unordered_map<void*, std::size_t> m_block_sizes;
};

std::mutex random123_allocate_unified::m_mutex{};
std::unordered_map<void*, std::size_t> random123_allocate_unified::m_block_sizes{};

using random123_allocator =
    boost::fast_pool_allocator<coreneuron::nrnran123_State, random123_allocate_unified>;
#else
using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State>;
#endif
/* Global data structure per process. Using a unique_ptr here causes [minor]
 * problems because its destructor can be called very late during application
 * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
 * been shut down then tools like cuda-memcheck reports errors.
 */
// Mutex taken around updates of g_instance_count (stream creation/deletion)
// and around the consistency check in nrnran123_set_globalindex.
OMP_Mutex g_instance_count_mutex;
// Number of Random123 streams currently alive: incremented by
// nrnran123_newstream3 and decremented by nrnran123_deletestream.
std::size_t g_instance_count{};

// When compiled as CUDA the key is declared with __device__ __constant__
// qualifiers; otherwise it is a plain host global.
#ifdef __CUDACC__
#define g_k_qualifiers __device__ __constant__
#else
#define g_k_qualifiers
#endif
// Philox key shared by every stream; word 0 holds the global index.
g_k_qualifiers philox4x32_key_t g_k{};
// Cannot refer to g_k directly from a nrn_pragma_acc(routine seq) method like
// coreneuron_random123_philox4x32_helper, and cannot have this inlined there at
// higher optimisation levels
// Returns a mutable reference to the process-wide Philox key g_k.
__attribute__((noinline)) philox4x32_key_t& global_state() {
    return g_k;
}
}  // namespace

// Evaluate Philox4x32 for the stream's current counter (s->c) using the
// process-wide key and return the resulting block of random words. The stream
// itself is not modified; callers store the result in s->r.
CORENRN_HOST_DEVICE philox4x32_ctr_t
coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
    return philox4x32(s->c, global_state());
}

namespace coreneuron {
// Number of Random123 streams currently alive in this process.
// Note: the counter is read here without taking g_instance_count_mutex.
std::size_t nrnran123_instance_count() {
    return g_instance_count;
}

/* if one sets the global, one should reset all the stream sequences. */
// Returns the global index, i.e. word 0 of the shared Philox key.
uint32_t nrnran123_get_globalindex() {
    return global_state().v[0];
}

/* nrn123 streams are created from cpu launcher routine */
// Set the global index (word 0 of the shared Philox key) and, when the value
// actually changes and the GPU is enabled, propagate the key to the device.
// A warning is printed on rank 0 if streams are alive, because they cached
// random words computed with the previous key.
void nrnran123_set_globalindex(uint32_t gix) {
    // If the global seed is changing then we shouldn't have any active streams.
    // NOTE: this local reference deliberately has the same name as the
    // namespace-scope g_k; the device-update statements below refer to it.
    auto& g_k = global_state();
    {
        // Hold the mutex so the count printed is consistent with the check.
        std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
        if (g_instance_count != 0 && nrnmpi_myid == 0) {
            std::cout
                << "nrnran123_set_globalindex(" << gix
                << ") called when a non-zero number of Random123 streams (" << g_instance_count
                << ") were active. This is not safe, some streams will remember the old value ("
                << g_k.v[0] << ')' << std::endl;
        }
    }
    // Skip the host write and the device transfer when nothing changes.
    if (g_k.v[0] != gix) {
        g_k.v[0] = gix;
        if (coreneuron::gpu_enabled()) {
#ifdef __CUDACC__
            // CUDA build: copy the key into the device symbol and wait for completion.
            // NOTE: the result codes are only checked by assert, so in builds
            // with NDEBUG a failing CUDA call goes unnoticed.
            {
                auto const code = cudaMemcpyToSymbol(g_k, &g_k, sizeof(g_k));
                assert(code == cudaSuccess);
            }
            {
                auto const code = cudaDeviceSynchronize();
                assert(code == cudaSuccess);
            }
#else
            // OpenACC / OpenMP offload build: refresh the device copy of the key.
            nrn_pragma_acc(update device(g_k))
            nrn_pragma_omp(target update to(g_k))
#endif
        }
    }
}

// Make the shared Philox key available on the device. Only does something when
// the GPU is enabled and the file is not compiled as CUDA; in that case an
// OpenACC `enter data copyin` is issued for g_k.
void nrnran123_initialise_global_state_on_device() {
    if (coreneuron::gpu_enabled()) {
#ifndef __CUDACC__
        nrn_pragma_acc(enter data copyin(g_k))
#endif
    }
}

// Counterpart of nrnran123_initialise_global_state_on_device: when the GPU is
// enabled and the file is not compiled as CUDA, issue an OpenACC
// `exit data delete` for g_k.
void nrnran123_destroy_global_state_on_device() {
    if (coreneuron::gpu_enabled()) {
#ifndef __CUDACC__
        nrn_pragma_acc(exit data delete (g_k))
#endif
    }
}

/** @brief Create a new Random123 stream identified by (id1, id2, id3).
 *
 *  The stream starts at sequence 0, word 0, and is counted in the global
 *  instance counter. It has to be released with nrnran123_deletestream using
 *  the same `use_unified_memory` value.
 *
 *  @todo  It would be nicer if the API return type was
 *  std::unique_ptr<nrnran123_State, ...not specified...>, so we could use a
 *  custom allocator/deleter and avoid the (fragile) need for matching
 *  nrnran123_deletestream calls.
 *
 *  @throws std::runtime_error in non-GPU builds when unified memory is requested.
 */
nrnran123_State* nrnran123_newstream3(uint32_t id1,
                                      uint32_t id2,
                                      uint32_t id3,
                                      bool use_unified_memory) {
    // `use_unified_memory` is an implementation detail that preserves the old
    // behaviour: streams known to be used only from the CPU come from
    // new/delete rather than unified memory (see OPENACC_EXCLUDED_FILES in
    // coreneuron/CMakeLists.txt). Without that feature everything could go
    // through coreneuron::unified_allocator.
#ifndef CORENEURON_ENABLE_GPU
    if (use_unified_memory) {
        throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build.");
    }
#endif
    nrnran123_State* const stream =
        use_unified_memory
            ? coreneuron::allocate_unique<nrnran123_State>(random123_allocator{}).release()
            : new nrnran123_State{};

    // Counter layout: word 0 = sequence, word 1 = id3, word 2 = id1, word 3 = id2.
    auto& counter = stream->c;
    counter.v[0] = 0;
    counter.v[1] = id3;
    counter.v[2] = id1;
    counter.v[3] = id2;

    // Position the stream at its start and compute the first block of values.
    nrnran123_setseq(stream, 0, 0);

    {
        std::lock_guard<OMP_Mutex> const guard{g_instance_count_mutex};
        ++g_instance_count;
    }
    return stream;
}

/* nrn123 streams are destroyed from cpu launcher routine */
/** @brief Destroy a stream created by nrnran123_newstream3 / nrnran123_newstream.
 *
 *  @param s                   Stream to destroy; a null pointer is accepted and ignored.
 *  @param use_unified_memory  Must match the value used when the stream was
 *                             created, because it selects the deallocation path.
 *  @throws std::runtime_error in non-GPU builds when unified memory is requested.
 */
void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) {
#ifndef CORENEURON_ENABLE_GPU
    if (use_unified_memory) {
        throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build.");
    }
#endif
    if (s == nullptr) {
        // A null pointer never came from nrnran123_newstream3, so it was never
        // counted. Decrementing here would wrap the unsigned instance counter.
        return;
    }
    {
        std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
        --g_instance_count;
    }
    if (use_unified_memory) {
        // Let a temporary owning pointer with the matching deleter release the stream.
        std::unique_ptr<nrnran123_State, coreneuron::alloc_deleter<random123_allocator>> _{s};
    } else {
        delete s;
    }
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/randoms/nrnran123.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#pragma once

/* interface to Random123 */
/* http://www.thesalmons.org/john/random123/papers/random123sc11.pdf */

/*
The 4x32 generators utilize a uint32x4 counter and uint32x4 key to transform
into an almost cryptographic quality uint32x4 random result.
There are many possibilities for balancing the sharing of the internal
state instances while reserving a uint32 counter for the stream sequence
and reserving other portions of the counter vector for stream identifiers
and global index used by all streams.

We currently provide a single instance by default in which the policy is
to use the 0th counter uint32 as the stream sequence, words 2 and 3 as the
stream identifier, and word 0 of the key as the global index. Unused words
are constant uint32 0.

It is also possible to use Random123 directly without reference to this
interface. See Random123-1.02/docs/html/index.html
of the full distribution available from
http://www.deshawresearch.com/resources_random123.html
*/

#ifdef __bgclang__
#define R123_USE_MULHILO64_MULHI_INTRIN 0
#define R123_USE_GNU_UINT128            1
#endif

#include "coreneuron/utils/offload.hpp"

#include <Random123/philox.h>
#include <inttypes.h>

#include <cmath>

// Some files are compiled with DISABLE_OPENACC, and some builds have no GPU
// support at all. In these two cases, request that the random123 state is
// allocated using new/delete instead of CUDA unified memory.
#if defined(CORENEURON_ENABLE_GPU) && !defined(DISABLE_OPENACC)
#define CORENRN_RAN123_USE_UNIFIED_MEMORY true
#else
#define CORENRN_RAN123_USE_UNIFIED_MEMORY false
#endif

namespace coreneuron {

/** State of one Random123 stream. */
struct nrnran123_State {
    /// Philox counter: word 0 is the sequence number, word 1 is id3,
    /// word 2 is id1 and word 3 is id2.
    philox4x32_ctr_t c;
    /// Block of random words computed for the current counter value.
    philox4x32_ctr_t r;
    /// Index into r.v of the next word to hand out.
    char which_;
};

}  // namespace coreneuron

/** @brief Provide a helper function in global namespace that is declared target for OpenMP
 * offloading to function correctly with NVHPC
 */
nrn_pragma_acc(routine seq)
nrn_pragma_omp(declare target)
philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s);
nrn_pragma_omp(end declare target)

namespace coreneuron {
/* copy the shared Philox key to / remove it from the device (no-op unless the GPU is enabled) */
void nrnran123_initialise_global_state_on_device();
void nrnran123_destroy_global_state_on_device();

/* global index. eg. run number */
/* all generator instances share this global index */
void nrnran123_set_globalindex(uint32_t gix);
uint32_t nrnran123_get_globalindex();

// Utilities used for calculating model size, only called from the CPU.
// nrnran123_instance_count: number of streams currently alive.
std::size_t nrnran123_instance_count();
// nrnran123_state_size: size in bytes of one stream state object.
inline std::size_t nrnran123_state_size() {
    return sizeof(nrnran123_State);
}

/* routines for creating and deleting streams are called from cpu */
/* A stream must be deleted with the same use_unified_memory value it was
 * created with, because that flag selects the allocation/deallocation path. */
nrnran123_State* nrnran123_newstream3(uint32_t id1,
                                      uint32_t id2,
                                      uint32_t id3,
                                      bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY);
/* two-identifier convenience wrapper: same as nrnran123_newstream3 with id3 = 0 */
inline nrnran123_State* nrnran123_newstream(
    uint32_t id1,
    uint32_t id2,
    bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY) {
    return nrnran123_newstream3(id1, id2, 0, use_unified_memory);
}
void nrnran123_deletestream(nrnran123_State* s,
                            bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY);

/* minimal data stream */
// Read back the stream position: *seq receives the sequence number (counter
// word 0) and *which the index of the next word to be handed out.
constexpr void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) {
    *seq = s->c.v[0];
    *which = s->which_;
}
// Read back the first two stream identifiers (counter words 2 and 3).
constexpr void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) {
    *id1 = s->c.v[2];
    *id2 = s->c.v[3];
}
// Read back all three stream identifiers: id3 from counter word 1, id1 from
// word 2 and id2 from word 3.
constexpr void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) {
    *id3 = s->c.v[1];
    *id1 = s->c.v[2];
    *id2 = s->c.v[3];
}

// Uniform 0 to 2^32-1
// Hands out the next cached 32-bit word. After the fourth word of a block has
// been consumed, the sequence number is advanced and a new block is computed.
inline uint32_t nrnran123_ipick(nrnran123_State* s) {
    char which = s->which_;
    // Take the current word, then move on to the next index.
    uint32_t rval{s->r.v[int{which++}]};
    if (which > 3) {
        // Block exhausted: bump the sequence and refill the cache.
        which = 0;
        s->c.v[0]++;
        s->r = coreneuron_random123_philox4x32_helper(s);
    }
    s->which_ = which;
    return rval;
}

// Convert a 32-bit integer into a double strictly inside (0, 1):
// u -> (u + 1) / (2^32 + 1), so the smallest result is about 2.3283064e-10
// and the largest is 1 minus that amount.
constexpr double nrnran123_uint2dbl(uint32_t u) {
    constexpr double inverse_range = 1.0 / 4294967297.0;  // 1 / (2^32 + 1)
    const double shifted = static_cast<double>(u) + 1.0;
    return shifted * inverse_range;
}

// Draw the next value as a double in the open interval (0,1); the smallest
// possible value is 2.3283064e-10 and the largest is 1 minus that.
inline double nrnran123_dblpick(nrnran123_State* s) {
    const uint32_t raw = nrnran123_ipick(s);
    return nrnran123_uint2dbl(raw);
}

/* this could be called from openacc parallel construct (in INITIAL block) */
/** Position the stream at sequence number `seq`, word `which`, and compute the
 *  block of random words for that sequence number.
 *
 *  `which` must be in 0..3; any other value (including negative values on
 *  platforms where char is signed) selects word 0, so that a later pick can
 *  never index outside the four cached words.
 */
inline void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) {
    // Comparing through unsigned char rejects values above 3 and negative
    // values in one test, independently of the signedness of plain char.
    if (static_cast<unsigned char>(which) > 3) {
        s->which_ = 0;
    } else {
        s->which_ = which;
    }
    s->c.v[0] = seq;
    s->r = coreneuron_random123_philox4x32_helper(s);
}

// Negative-exponential variate with mean 1.0: minus the logarithm of a uniform
// (0,1) pick. Smallest value is 2.3283064e-10, largest is 22.18071.
inline double nrnran123_negexp(nrnran123_State* s) {
    const double uniform = nrnran123_dblpick(s);
    return -std::log(uniform);
}

/* Normal variate by the polar rejection method: draw points in the square
 * (-1,1) x (-1,1) until one is not outside the unit circle, then scale its
 * first coordinate. At the cost of a cached value two variates could be
 * produced per accepted point. */
inline double nrnran123_normal(nrnran123_State* s) {
    double x{};
    double radius_sq{};
    for (;;) {
        x = nrnran123_dblpick(s);
        double y{nrnran123_dblpick(s)};
        // Map both picks from (0,1) to (-1,1).
        x = 2. * x - 1.;
        y = 2. * y - 1.;
        radius_sq = (x * x) + (y * y);
        if (!(radius_sq > 1)) {
            break;  // point accepted
        }
    }
    const double scale{std::sqrt((-2. * std::log(radius_sq)) / radius_sq)};
    return x * scale;
}

// nrnran123_gauss, nrnran123_iran were declared but not defined in CoreNEURON
// nrnran123_array4x32 was declared but not used in CoreNEURON
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/string_utils.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#include <cstring>

// Copy src_length bytes of src into dest starting at start_position, write a
// terminating '\0' right behind them and return the index of that terminator.
// dest must have room for start_position + src_length + 1 bytes.
unsigned strcat_at_pos(char* dest, unsigned start_position, char* src, unsigned src_length) {
    const unsigned end_position = start_position + src_length;
    std::memcpy(dest + start_position, src, src_length);
    dest[end_position] = '\0';
    return end_position;
}


================================================
FILE: coreneuron/utils/string_utils.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

/**
 * @file string_utils.h
 * @brief Utility functions for strings
 *
 */

#pragma once

/** @brief Appends a copy of the source string to the destination string.
 *
 *  A null-character is included at the end of the new string formed by the concatenation of both in
 * destination. It has similar behavior to strcat but better performance in case that it is needed
 * to append a char array to another very large char array, because the caller supplies the write
 * position instead of having it searched for.
 *
 *  @param dest Destination string; must have room for start_position + src_length + 1 characters
 *  @param start_position Position of dest to start writing src
 *  @param src Source string
 *  @param src_length Number of characters of src to append to dest
 *  @return start_position + src_length, i.e. the index in dest of the null terminating character
 * that was written; it can be passed as start_position of the next call
 */
unsigned strcat_at_pos(char* dest, unsigned start_position, char* src, unsigned src_length);


================================================
FILE: coreneuron/utils/units.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/
#pragma once
namespace coreneuron {
namespace units {
#if CORENEURON_USE_LEGACY_UNITS == 1
// Legacy values, kept verbatim.
constexpr double faraday = 96485.309;
constexpr double gasconstant = 8.3134;
#else
/* MOD files translated by NMODL normally take their unit constants from
 * share/lib/nrnunits.lib.in, but some other source files hardcode a few of
 * them. The modern values are collected here in one place (and,
 * unfortunately, also in nrnunits.lib.in). The legacy values cannot be
 * collected the same way because they differ slightly from place to place.
 *
 * Source: https://physics.nist.gov/cuu/Constants/index.html, the
 * "2018 CODATA recommended values", available since 20 May 2019 and
 * replacing the 2014 CODATA set.
 *
 * See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h
 */
namespace detail {
constexpr double electron_charge = 1.602176634e-19;  // coulomb exact
constexpr double avogadro_number = 6.02214076e+23;   // exact
constexpr double boltzmann = 1.380649e-23;           // joule/K exact
}  // namespace detail
// 96485.33212... coulomb/mol
constexpr double faraday = detail::electron_charge * detail::avogadro_number;
// 8.314462618... joule/mol-K
constexpr double gasconstant = detail::boltzmann * detail::avogadro_number;
#endif
}  // namespace units
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/utils.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2021-22 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include <sys/time.h>
#include "utils.hpp"
#include "coreneuron/apps/corenrn_parameters.hpp"

namespace coreneuron {
// Terminate the program. In MPI builds, when MPI use is enabled and MPI has
// been initialised, the MPI abort routine is called with errcode first; in
// every other case (or should that call return) std::abort() ends the process.
[[noreturn]] void nrn_abort(int errcode) {
#if NRNMPI
    const bool mpi_usable = corenrn_param.mpi_enable && nrnmpi_initialized();
    if (mpi_usable) {
        nrnmpi_abort(errcode);
    }
#endif
    std::abort();
}

// Wall-clock time in seconds. Uses the MPI timer when MPI use is enabled in an
// MPI build, and gettimeofday() (seconds plus microseconds) otherwise.
double nrn_wtime() {
#if NRNMPI
    if (corenrn_param.mpi_enable) {
        return nrnmpi_wtime();
    }
#endif
    struct timeval now;
    gettimeofday(&now, nullptr);
    return (now.tv_sec + now.tv_usec / 1.e6);
}
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/utils.hpp
================================================
/*
# =============================================================================
# Copyright (c) 2021-22 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <utility>
#include "coreneuron/mpi/nrnmpi.h"
#include "coreneuron/mpi/core/nrnmpi.hpp"

namespace coreneuron {
[[noreturn]] void nrn_abort(int errcode);
/**
 * @brief Print a printf-style message on rank 0 and abort the run.
 *
 * @param msg  printf format string. It is passed to printf unchanged, so it
 *             must come from trusted code and match the given arguments.
 * @param args Arguments consumed by the format string.
 *
 * Every rank calls nrn_abort(-1); the function does not return.
 */
template <typename... Args>
void nrn_fatal_error(const char* msg, Args&&... args) {
    if (nrnmpi_myid == 0) {
        printf(msg, std::forward<Args>(args)...);
        // stdout may be buffered and the abort that follows is not required
        // to flush it, so push the message out explicitly before terminating.
        fflush(stdout);
    }
    nrn_abort(-1);
}
extern double nrn_wtime(void);
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/utils_cuda.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include <stdio.h>
#include <cuda_runtime_api.h>

// From Random123 lib
// CHECKLAST(MSG): query cudaGetLastError(); if it reports an error, print
// "<file>:<line>: CUDA Error: <MSG>: <CUDA error string>" to stderr and
// terminate the process with exit(1). MSG must be a C string.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define CHECKLAST(MSG)                             \
    do {                                           \
        cudaError_t e = cudaGetLastError();        \
        if (e != cudaSuccess) {                    \
            fprintf(stderr,                        \
                    "%s:%d: CUDA Error: %s: %s\n", \
                    __FILE__,                      \
                    __LINE__,                      \
                    (MSG),                         \
                    cudaGetErrorString(e));        \
            exit(1);                               \
        }                                          \
    } while (0)
// CHECKCALL(RET): evaluate RET (an expression yielding cudaError_t, typically
// a CUDA runtime call) exactly once; if the result is not cudaSuccess, print
// "<file>:<line>: CUDA Error: <CUDA error string>" to stderr and terminate the
// process with exit(1).
#define CHECKCALL(RET)                                                                             \
    do {                                                                                           \
        cudaError_t e = (RET);                                                                     \
        if (e != cudaSuccess) {                                                                    \
            fprintf(stderr, "%s:%d: CUDA Error: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
            exit(1);                                                                               \
        }                                                                                          \
    } while (0)


================================================
FILE: coreneuron/utils/vrecitem.h
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#pragma once

#include "coreneuron/network/netcon.hpp"
#include "coreneuron/utils/ivocvect.hpp"
namespace coreneuron {
class PlayRecord;

#define PlayRecordType        0
#define VecPlayContinuousType 4
#define PlayRecordEventType   21

// used by PlayRecord subclasses that utilize discrete events
class PlayRecordEvent: public DiscreteEvent {
  public:
    PlayRecordEvent() = default;
    virtual ~PlayRecordEvent() = default;
    virtual void deliver(double, NetCvode*, NrnThread*) override;
    virtual void pr(const char*, double t, NetCvode*) override;
    virtual NrnThread* thread();
    PlayRecord* plr_;
    static unsigned long playrecord_send_;
    static unsigned long playrecord_deliver_;
    virtual int type() const override {
        return PlayRecordEventType;
    }
};

// common interface for Play and Record for all integration methods.
// Every hook has an empty default implementation; subclasses override what they need.
class PlayRecord {
  public:
    // pd: the variable that is played into / recorded from; ith: index of the owning thread.
    PlayRecord(double* pd, int ith);
    virtual ~PlayRecord() = default;
    virtual void play_init() {}  // called near beginning of finitialize
    virtual void continuous(double) {
    }  // play - every f(y, t) or res(y', y, t); record - advance_tn and initialize flag
    virtual void deliver(double, NetCvode*) {}  // at associated DiscreteEvent
    // Event object associated with this PlayRecord, or nullptr if there is none.
    virtual PlayRecordEvent* event() {
        return nullptr;
    }
    virtual void pr();  // print identifying info
    // Type tag (one of the *Type macros) identifying the concrete class.
    virtual int type() const {
        return PlayRecordType;
    }

    double* pd_;  // Target variable; not owned by this object
    int ith_;  // The thread index
};

// PlayRecord that drives *pd_ from a table of times (t_) and values (y_),
// interpolating linearly between samples. It owns a PlayRecordEvent (e_) that
// is used to step through the table.
class VecPlayContinuous: public PlayRecord {
  public:
    // yvec and tvec are moved into the object; discon (may be null) is only
    // referenced and is not freed by this class.
    VecPlayContinuous(double*, IvocVect&& yvec, IvocVect&& tvec, IvocVect* discon, int ith);
    virtual ~VecPlayContinuous();

    // The destructor deletes e_ and e_->plr_ points back at this object, so a
    // memberwise copy would lead to a double delete and a stale back pointer.
    // Copying is therefore disabled.
    VecPlayContinuous(const VecPlayContinuous&) = delete;
    VecPlayContinuous& operator=(const VecPlayContinuous&) = delete;

    virtual void play_init() override;
    virtual void deliver(double tt, NetCvode*) override;
    virtual PlayRecordEvent* event() override {
        return e_;
    }
    virtual void pr() override;

    void continuous(double tt) override;
    // Value of the played signal at time tt.
    double interpolate(double tt);
    // Linear interpolation between x0 (th == 0) and x1 (th == 1).
    double interp(double th, double x0, double x1) {
        return x0 + (x1 - x0) * th;
    }
    // Move last_index_ to the first sample whose time is greater than tt.
    void search(double tt);

    virtual int type() const override {
        return VecPlayContinuousType;
    }

    IvocVect y_;                   // sample values
    IvocVect t_;                   // sample times
    IvocVect* discon_indices_;     // optional indices into t_; not owned
    std::size_t last_index_{};     // upper sample of the current interpolation interval
    std::size_t discon_index_{};   // next entry of discon_indices_ to use
    std::size_t ubound_index_{};   // sample index at which the next event is scheduled

    PlayRecordEvent* e_ = nullptr;  // Need to be a raw pointer for acc
};
}  // namespace coreneuron


================================================
FILE: coreneuron/utils/vrecord.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/

#include <cstdio>

#include "coreneuron/nrnconf.h"
#include "coreneuron/sim/multicore.hpp"
#include "coreneuron/utils/ivocvect.hpp"
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/utils/vrecitem.h"
namespace coreneuron {
extern NetCvode* net_cvode_instance;

// Forward the delivery to the owning PlayRecord; the thread argument is unused.
void PlayRecordEvent::deliver(double tt, NetCvode* ns, NrnThread*) {
    plr_->deliver(tt, ns);
}

// Thread of the owning PlayRecord, looked up in nrn_threads by its thread index.
NrnThread* PlayRecordEvent::thread() {
    return nrn_threads + plr_->ith_;
}

// Print the prefix s, the event kind and the time tt, then let the owning
// PlayRecord print its own identifying info.
void PlayRecordEvent::pr(const char* s, double tt, NetCvode*) {
    printf("%s PlayRecordEvent %.15g ", s, tt);
    plr_->pr();
}

// Store the target variable pointer and the index of the owning thread.
PlayRecord::PlayRecord(double* pd, int ith)
    : pd_(pd)
    , ith_(ith) {}

// Print identifying info; the base class only prints its class name.
void PlayRecord::pr() {
    printf("PlayRecord\n");
}

// Take ownership of the value and time vectors (moved in), keep a reference to
// the optional discontinuity indices, and create the event object used to step
// through the table. The event is linked back to this object.
VecPlayContinuous::VecPlayContinuous(double* pd,
                                     IvocVect&& yvec,
                                     IvocVect&& tvec,
                                     IvocVect* discon,
                                     int ith)
    : PlayRecord(pd, ith)
    , y_(std::move(yvec))
    , t_(std::move(tvec))
    , discon_indices_(discon)
    , e_(new PlayRecordEvent{}) {
    e_->plr_ = this;
}

// Release the event created in the constructor. discon_indices_ is not freed here.
VecPlayContinuous::~VecPlayContinuous() {
    delete e_;
}

void VecPlayContinuous::play_init() {
    NrnThread* nt = nrn_threads + ith_;
    last_index_ = 0;
    discon_index_ = 0;
    if (discon_indices_) {
        if (discon_indices_->size() > 0) {
            ubound_index_ = (int) (*discon_indices_)[discon_index_++];
            // printf("play_init %d %g\n", ubound_index_, t_->elem(ubound_index_));
            e_->send(t_[ubound_index_], net_cvode_instance, nt);
        } else {
            ubound_index_ = t_.size() - 1;
        }
    } else {
        ubound_index_ = 0;
        e_->send(t_[ubound_index_], net_cvode_instance, nt);
    }
}

// Called when the scheduled event fires at time tt: advance the interpolation
// window to the next sample (or next discontinuity), schedule the following
// event if there is one, mirror the updated indices to the device when the
// thread computes on the GPU, and finally update the played variable.
void VecPlayContinuous::deliver(double tt, NetCvode* ns) {
    NrnThread* nt = nrn_threads + ith_;
    // printf("deliver %g\n", tt);
    // The sample that just became due is the new lower reference.
    last_index_ = ubound_index_;
    // clang-format off

    nrn_pragma_acc(update device(last_index_) if (nt->compute_gpu))
    nrn_pragma_omp(target update to(last_index_) if (nt->compute_gpu))
    // clang-format on
    if (discon_indices_) {
        if (discon_index_ < discon_indices_->size()) {
            // Jump to the next listed discontinuity and schedule an event for it.
            ubound_index_ = (int) (*discon_indices_)[discon_index_++];
            // printf("after deliver:send %d %g\n", ubound_index_, t_->elem(ubound_index_));
            e_->send(t_[ubound_index_], ns, nt);
        } else {
            // No discontinuities left: the bound is the last sample, no further event.
            ubound_index_ = t_.size() - 1;
        }
    } else {
        // Without a discontinuity vector, step one sample at a time until the end.
        if (ubound_index_ < t_.size() - 1) {
            ubound_index_++;
            e_->send(t_[ubound_index_], ns, nt);
        }
    }
    // clang-format off

    nrn_pragma_acc(update device(ubound_index_) if (nt->compute_gpu))
    nrn_pragma_omp(target update to(ubound_index_) if (nt->compute_gpu))
    // clang-format on
    continuous(tt);
}

// Write the interpolated value for time tt into the played variable (*pd_).
// In GPU builds the assignment is wrapped in an offload region that is only
// active when the owning thread computes on the GPU.
void VecPlayContinuous::continuous(double tt) {
#ifdef CORENEURON_ENABLE_GPU
    // nt is only needed by the offload pragmas below.
    NrnThread* nt = nrn_threads + ith_;
#endif
    // clang-format off

    nrn_pragma_acc(kernels present(this) if(nt->compute_gpu))
    nrn_pragma_omp(target if(nt->compute_gpu))
    {
        *pd_ = interpolate(tt);
    }
    // clang-format on
}

// Value of the played signal at time tt. Updates last_index_ as a side effect.
//  - tt at or beyond the current upper bound: use the upper-bound sample as the
//    right end of the segment (or return y_[0] directly if that is sample 0);
//  - tt at or before the first sample: return the first value;
//  - otherwise: locate the segment containing tt with search().
// The result is the linear formula through the two samples last_index_ - 1 and
// last_index_; if both have the same time, their mean value is returned.
double VecPlayContinuous::interpolate(double tt) {
    if (tt >= t_[ubound_index_]) {
        last_index_ = ubound_index_;
        if (last_index_ == 0) {
            // printf("return last tt=%g ubound=%g y=%g\n", tt, t_->elem(ubound_index_),
            // y_->elem(last_index_));
            return y_[last_index_];
        }
        // Otherwise fall through to the formula below; for tt beyond
        // t_[last_index_] the interpolation parameter exceeds 1.
    } else if (tt <= t_[0]) {
        last_index_ = 0;
        // printf("return elem(0) tt=%g t0=%g y=%g\n", tt, t_->elem(0), y_->elem(0));
        return y_[0];
    } else {
        search(tt);
    }
    double x0 = y_[last_index_ - 1];
    double x1 = y_[last_index_];
    double t0 = t_[last_index_ - 1];
    double t1 = t_[last_index_];
    // printf("IvocVectRecorder::continuous tt=%g t0=%g t1=%g theta=%g x0=%g x1=%g\n", tt, t0, t1,
    // (tt - t0)/(t1 - t0), x0, x1);
    if (t0 == t1) {
        // Two samples at the same time: avoid dividing by zero.
        return (x0 + x1) / 2.;
    }
    return interp((tt - t0) / (t1 - t0), x0, x1);
}

// Move last_index_ so that it names the first sample whose time is greater
// than tt. There is no bounds checking here: interpolate() only calls this
// when t_[0] < tt < t_[ubound_index_].
void VecPlayContinuous::search(double tt) {
    // Walk left while the current sample lies after tt ...
    for (; tt < t_[last_index_]; --last_index_) {
    }
    // ... then walk right past every sample at or before tt.
    for (; tt >= t_[last_index_]; ++last_index_) {
    }
}

// Print the class tag "VecPlayContinuous " (with trailing space, no newline)
// to standard output.
void VecPlayContinuous::pr() {
    printf("%s", "VecPlayContinuous ");
}
}  // namespace coreneuron


================================================
FILE: docs/Doxyfile.in
================================================
# Doxyfile 1.8.15

# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
#
# All text after a double hash (##) is considered a comment and is placed in
# front of the TAG it is preceding.
#
# All text after a single hash (#) is considered a comment and will be ignored.
# The format is:
# TAG = value [value, ...]
# For lists, items can also be appended using:
# TAG += value [value, ...]
# Values that contain spaces should be placed between quotes (\" \").

#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------

# This tag specifies the encoding used for all characters in the configuration
# file that follow. The default is UTF-8 which is also the encoding used for all
# text before the first occurrence of this tag. Doxygen uses libiconv (or the
# iconv built into libc) for the transcoding. See
# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
# The default value is: UTF-8.

DOXYFILE_ENCODING      = UTF-8

# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
# double-quotes, unless you are using Doxywizard) that should identify the
# project for which the documentation is generated. This name is used in the
# title of most generated pages and in a few other places.
# The default value is: My Project.

PROJECT_NAME           = "CoreNEURON"

# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER         =

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.

PROJECT_BRIEF          =

# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
# the logo to the output directory.

#PROJECT_LOGO           = @PROJECT_SOURCE_DIR@/docs/logo.png

# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
# into which the generated documentation will be written. If a relative path is
# entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used.

OUTPUT_DIRECTORY       = @CMAKE_CURRENT_BINARY_DIR@/docs

# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and
# will distribute the generated files over these directories. Enabling this
# option can be useful when feeding doxygen a huge amount of source files, where
# putting all generated files in the same directory would otherwise cause
# performance problems for the file system.
# The default value is: NO.

CREATE_SUBDIRS         = NO

# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
# characters to appear in the names of generated files. If set to NO, non-ASCII
# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
# U+3044.
# The default value is: NO.

ALLOW_UNICODE_NAMES    = NO

# The OUTPUT_LANGUAGE tag is used to specify the language in which all
# documentation generated by doxygen is written. Doxygen will use this
# information to generate all constant output in the proper language.
# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
# Ukrainian and Vietnamese.
# The default value is: English.

OUTPUT_LANGUAGE        = English

# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
# documentation generated by doxygen is written. Doxygen will use this
# information to generate all generated output in the proper direction.
# Possible values are: None, LTR, RTL and Context.
# The default value is: None.

OUTPUT_TEXT_DIRECTION  = None

# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
# descriptions after the members that are listed in the file and class
# documentation (similar to Javadoc). Set to NO to disable this.
# The default value is: YES.

BRIEF_MEMBER_DESC      = YES

# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
# description of a member or function before the detailed description.
#
# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
# brief descriptions will be completely suppressed.
# The default value is: YES.

REPEAT_BRIEF           = YES

# This tag implements a quasi-intelligent brief description abbreviator that is
# used to form the text in various listings. Each string in this list, if found
# as the leading text of the brief description, will be stripped from the text
# and the result, after processing the whole list, is used as the annotated
# text. Otherwise, the brief description is used as-is. If left blank, the
# following values are used ($name is automatically replaced with the name of
# the entity):The $name class, The $name widget, The $name file, is, provides,
# specifies, contains, represents, a, an and the.

ABBREVIATE_BRIEF       = "The $name class" \
                         "The $name widget" \
                         "The $name file" \
                         is \
                         provides \
                         specifies \
                         contains \
                         represents \
                         a \
                         an \
                         the

# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
# doxygen will generate a detailed section even if there is only a brief
# description.
# The default value is: NO.

ALWAYS_DETAILED_SEC    = NO

# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
# inherited members of a class in the documentation of that class as if those
# members were ordinary class members. Constructors, destructors and assignment
# operators of the base classes will not be shown.
# The default value is: NO.

INLINE_INHERITED_MEMB  = NO

# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
# before files name in the file list and in the header files. If set to NO the
# shortest path that makes the file name unique will be used
# The default value is: YES.

FULL_PATH_NAMES        = YES

# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
# Stripping is only done if one of the specified strings matches the left-hand
# part of the path. The tag can be used to show relative paths in the file list.
# If left blank the directory from which doxygen is run is used as the path to
# strip.
#
# Note that you can specify absolute paths here, but also relative paths, which
# will be relative from the directory where doxygen is started.
# This tag requires that the tag FULL_PATH_NAMES is set to YES.

STRIP_FROM_PATH        =

# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
# path mentioned in the documentation of a class, which tells the reader which
# header file to include in order to use a class. If left blank only the name of
# the header file containing the class definition is used. Otherwise one should
# specify the list of include paths that are normally passed to the compiler
# using the -I flag.

STRIP_FROM_INC_PATH    =

# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
# less readable) file names. This can be useful if your file system doesn't
# support long names like on DOS, Mac, or CD-ROM.
# The default value is: NO.

SHORT_NAMES            = NO

# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
# first line (until the first dot) of a Javadoc-style comment as the brief
# description. If set to NO, the Javadoc-style will behave just like regular Qt-
# style comments (thus requiring an explicit @brief command for a brief
# description.)
# The default value is: NO.

JAVADOC_AUTOBRIEF      = YES

# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If
# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
# requiring an explicit \brief command for a brief description.)
# The default value is: NO.

QT_AUTOBRIEF           = NO

# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
# a brief description. This used to be the default behavior. The new default is
# to treat a multi-line C++ comment block as a detailed description. Set this
# tag to YES if you prefer the old behavior instead.
#
# Note that setting this tag to YES also means that rational rose comments are
# not recognized any more.
# The default value is: NO.

MULTILINE_CPP_IS_BRIEF = NO

# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
# documentation from any documented member that it re-implements.
# The default value is: YES.

INHERIT_DOCS           = YES

# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
# page for each member. If set to NO, the documentation of a member will be part
# of the file/class/namespace that contains it.
# The default value is: NO.

SEPARATE_MEMBER_PAGES  = NO

# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
# uses this value to replace tabs by spaces in code fragments.
# Minimum value: 1, maximum value: 16, default value: 4.

TAB_SIZE               = 4

# This tag can be used to specify a number of aliases that act as commands in
# the documentation. An alias has the form:
# name=value
# For example adding
# "sideeffect=@par Side Effects:\n"
# will allow you to put the command \sideeffect (or @sideeffect) in the
# documentation, which will result in a user-defined paragraph with heading
# "Side Effects:". You can put \n's in the value part of an alias to insert
# newlines (in the resulting output). You can put ^^ in the value part of an
# alias to insert a newline as if a physical newline was in the original file.
# When you need a literal { or } or , in the value part of an alias you have to
# escape them by means of a backslash (\), this can lead to conflicts with the
# commands \{ and \} for these it is advised to use the version @{ and @} or use
# a double escape (\\{ and \\})

ALIASES                =

# This tag can be used to specify a number of word-keyword mappings (TCL only).
# A mapping has the form "name=value". For example adding "class=itcl::class"
# will allow you to use the command class in the itcl::class meaning.

TCL_SUBST              =

# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
# instance, some of the names that are used will be different. The list of all
# members will be omitted, etc.
# The default value is: NO.

OPTIMIZE_OUTPUT_FOR_C  = NO

# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
# Python sources only. Doxygen will then generate output that is more tailored
# for that language. For instance, namespaces will be presented as packages,
# qualified scopes will look different, etc.
# The default value is: NO.

OPTIMIZE_OUTPUT_JAVA   = NO

# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
# sources. Doxygen will then generate output that is tailored for Fortran.
# The default value is: NO.

OPTIMIZE_FOR_FORTRAN   = NO

# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
# sources. Doxygen will then generate output that is tailored for VHDL.
# The default value is: NO.

OPTIMIZE_OUTPUT_VHDL   = NO

# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
# sources only. Doxygen will then generate output that is more tailored for that
# language. For instance, namespaces will be presented as modules, types will be
# separated into more groups, etc.
# The default value is: NO.

OPTIMIZE_OUTPUT_SLICE  = NO

# Doxygen selects the parser to use depending on the extension of the files it
# parses. With this tag you can assign which parser to use for a given
# extension. Doxygen has a built-in mapping, but you can override or extend it
# using this tag. The format is ext=language, where ext is a file extension, and
# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
# tries to guess whether the code is fixed or free formatted code, this is the
# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
# .inc files as Fortran files (default is PHP), and .f files as C (default is
# Fortran), use: inc=Fortran f=C.
#
# Note: For files without extension you can use no_extension as a placeholder.
#
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
# the files are not read by doxygen.

EXTENSION_MAPPING      = .yaml=Python

# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
# documentation. See https://daringfireball.net/projects/markdown/ for details.
# The output of markdown processing is further processed by doxygen, so you can
# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
# case of backward compatibilities issues.
# The default value is: YES.

MARKDOWN_SUPPORT       = YES

# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
# to that level are automatically included in the table of contents, even if
# they do not have an id attribute.
# Note: This feature currently applies only to Markdown headings.
# Minimum value: 0, maximum value: 99, default value: 0.
# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.

TOC_INCLUDE_HEADINGS   = 0

# When enabled doxygen tries to link words that correspond to documented
# classes, or namespaces to their corresponding documentation. Such a link can
# be prevented in individual cases by putting a % sign in front of the word or
# globally by setting AUTOLINK_SUPPORT to NO.
# The default value is: YES.

AUTOLINK_SUPPORT       = YES

# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
# to include (a tag file for) the STL sources as input, then you should set this
# tag to YES in order to let doxygen match functions declarations and
# definitions whose arguments contain STL classes (e.g. func(std::string);
# versus func(std::string) {}). This also makes the inheritance and collaboration
# diagrams that involve STL classes more complete and accurate.
# The default value is: NO.

BUILTIN_STL_SUPPORT    = YES

# If you use Microsoft's C++/CLI language, you should set this option to YES to
# enable parsing support.
# The default value is: NO.

CPP_CLI_SUPPORT        = NO

# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
# will parse them like normal C++ but will assume all classes use public instead
# of private inheritance when no explicit protection keyword is present.
# The default value is: NO.

SIP_SUPPORT            = NO

# For Microsoft's IDL there are propget and propput attributes to indicate
# getter and setter methods for a property. Setting this option to YES will make
# doxygen to replace the get and set methods by a property in the documentation.
# This will only work if the methods are indeed getting or setting a simple
# type. If this is not the case, or you want to show the methods anyway, you
# should set this option to NO.
# The default value is: YES.

IDL_PROPERTY_SUPPORT   = YES

# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
# tag is set to YES then doxygen will reuse the documentation of the first
# member in the group (if any) for the other members of the group. By default
# all members of a group must be documented explicitly.
# The default value is: NO.

DISTRIBUTE_GROUP_DOC   = NO

# If one adds a struct or class to a group and this option is enabled, then also
# any nested class or struct is added to the same group. By default this option
# is disabled and one has to add nested compounds explicitly via \ingroup.
# The default value is: NO.

GROUP_NESTED_COMPOUNDS = NO

# Set the SUBGROUPING tag to YES to allow class member groups of the same type
# (for instance a group of public functions) to be put as a subgroup of that
# type (e.g. under the Public Functions section). Set it to NO to prevent
# subgrouping. Alternatively, this can be done per class using the
# \nosubgrouping command.
# The default value is: YES.

SUBGROUPING            = YES

# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
# are shown inside the group in which they are included (e.g. using \ingroup)
# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
# and RTF).
#
# Note that this feature does not work in combination with
# SEPARATE_MEMBER_PAGES.
# The default value is: NO.

INLINE_GROUPED_CLASSES = NO

# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
# with only public data fields or simple typedef fields will be shown inline in
# the documentation of the scope in which they are defined (i.e. file,
# namespace, or group documentation), provided this scope is documented. If set
# to NO, structs, classes, and unions are shown on a separate page (for HTML and
# Man pages) or section (for LaTeX and RTF).
# The default value is: NO.

INLINE_SIMPLE_STRUCTS  = NO

# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
# enum is documented as struct, union, or enum with the name of the typedef. So
# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
# with name TypeT. When disabled the typedef will appear as a member of a file,
# namespace, or class. And the struct will be named TypeS. This can typically be
# useful for C code in case the coding convention dictates that all compound
# types are typedef'ed and only the typedef is referenced, never the tag name.
# The default value is: NO.

TYPEDEF_HIDES_STRUCT   = NO

# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
# cache is used to resolve symbols given their name and scope. Since this can be
# an expensive process and often the same symbol appears multiple times in the
# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
# doxygen will become slower. If the cache is too large, memory is wasted. The
# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
# symbols. At the end of a run doxygen will report the cache usage and suggest
# the optimal cache size from a speed point of view.
# Minimum value: 0, maximum value: 9, default value: 0.

LOOKUP_CACHE_SIZE      = 0

#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------

# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
# documentation are documented, even if no documentation was available. Private
# class members and static file members will be hidden unless the
# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
# Note: This will also disable the warnings about undocumented members that are
# normally produced when WARNINGS is set to YES.
# The default value is: NO.

EXTRACT_ALL            = YES

# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
# be included in the documentation.
# The default value is: NO.

EXTRACT_PRIVATE        = YES

# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
# scope will be included in the documentation.
# The default value is: NO.

EXTRACT_PACKAGE        = YES

# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
# included in the documentation.
# The default value is: NO.

EXTRACT_STATIC         = YES

# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
# locally in source files will be included in the documentation. If set to NO,
# only classes defined in header files are included. Does not have any effect
# for Java sources.
# The default value is: YES.

EXTRACT_LOCAL_CLASSES  = YES

# This flag is only useful for Objective-C code. If set to YES, local methods,
# which are defined in the implementation section but not in the interface are
# included in the documentation. If set to NO, only methods in the interface are
# included.
# The default value is: NO.

EXTRACT_LOCAL_METHODS  = NO

# If this flag is set to YES, the members of anonymous namespaces will be
# extracted and appear in the documentation as a namespace called
# 'anonymous_namespace{file}', where file will be replaced with the base name of
# the file that contains the anonymous namespace. By default anonymous namespaces
# are hidden.
# The default value is: NO.

EXTRACT_ANON_NSPACES   = NO

# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
# members will be included in the various overviews, but no documentation
# section is generated. This option has no effect if EXTRACT_ALL is enabled.
# The default value is: NO.

HIDE_UNDOC_MEMBERS     = NO

# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
# undocumented classes that are normally visible in the class hierarchy. If set
# to NO, these classes will be included in the various overviews. This option
# has no effect if EXTRACT_ALL is enabled.
# The default value is: NO.

HIDE_UNDOC_CLASSES     = NO

# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
# (class|struct|union) declarations. If set to NO, these declarations will be
# included in the documentation.
# The default value is: NO.

HIDE_FRIEND_COMPOUNDS  = NO

# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
# documentation blocks found inside the body of a function. If set to NO, these
# blocks will be appended to the function's detailed documentation block.
# The default value is: NO.

HIDE_IN_BODY_DOCS      = NO

# The INTERNAL_DOCS tag determines if documentation that is typed after a
# \internal command is included. If the tag is set to NO then the documentation
# will be excluded. Set it to YES to include the internal documentation.
# The default value is: NO.

INTERNAL_DOCS          = NO

# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
# names in lower-case letters. If set to YES, upper-case letters are also
# allowed. This is useful if you have classes or files whose names only differ
# in case and if your file system supports case sensitive file names. Windows
# and Mac users are advised to set this option to NO.
# The default value is: system dependent.

CASE_SENSE_NAMES       = NO

# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
# their full class and namespace scopes in the documentation. If set to YES, the
# scope will be hidden.
# The default value is: NO.

HIDE_SCOPE_NAMES       = NO

# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
# append additional text to a page's title, such as Class Reference. If set to
# YES the compound reference will be hidden.
# The default value is: NO.

HIDE_COMPOUND_REFERENCE= NO

# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
# the files that are included by a file in the documentation of that file.
# The default value is: YES.

SHOW_INCLUDE_FILES     = YES

# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
# grouped member an include statement to the documentation, telling the reader
# which file to include in order to use the member.
# The default value is: NO.

SHOW_GROUPED_MEMB_INC  = NO

# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
# files with double quotes in the documentation rather than with sharp brackets.
# The default value is: NO.

FORCE_LOCAL_INCLUDES   = NO

# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
# documentation for inline members.
# The default value is: YES.

INLINE_INFO            = YES

# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
# (detailed) documentation of file and class members alphabetically by member
# name. If set to NO, the members will appear in declaration order.
# The default value is: YES.

SORT_MEMBER_DOCS       = YES

# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
# descriptions of file, namespace and class members alphabetically by member
# name. If set to NO, the members will appear in declaration order. Note that
# this will also influence the order of the classes in the class list.
# The default value is: NO.

SORT_BRIEF_DOCS        = NO

# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
# (brief and detailed) documentation of class members so that constructors and
# destructors are listed first. If set to NO the constructors will appear in the
# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
# member documentation.
# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
# detailed member documentation.
# The default value is: NO.

SORT_MEMBERS_CTORS_1ST = NO

# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
# of group names into alphabetical order. If set to NO the group names will
# appear in their defined order.
# The default value is: NO.

SORT_GROUP_NAMES       = NO

# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
# fully-qualified names, including namespaces. If set to NO, the class list will
# be sorted only by class name, not including the namespace part.
# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
# Note: This option applies only to the class list, not to the alphabetical
# list.
# The default value is: NO.

SORT_BY_SCOPE_NAME     = NO

# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
# type resolution of all parameters of a function it will reject a match between
# the prototype and the implementation of a member function even if there is
# only one candidate or it is obvious which candidate to choose by doing a
# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
# accept a match between prototype and implementation in such cases.
# The default value is: NO.

STRICT_PROTO_MATCHING  = NO

# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
# list. This list is created by putting \todo commands in the documentation.
# The default value is: YES.

GENERATE_TODOLIST      = YES

# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
# list. This list is created by putting \test commands in the documentation.
# The default value is: YES.

GENERATE_TESTLIST      = YES

# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
# list. This list is created by putting \bug commands in the documentation.
# The default value is: YES.

GENERATE_BUGLIST       = YES

# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
# the deprecated list. This list is created by putting \deprecated commands in
# the documentation.
# The default value is: YES.

GENERATE_DEPRECATEDLIST= YES

# The ENABLED_SECTIONS tag can be used to enable conditional documentation
# sections, marked by \if <section_label> ... \endif and \cond <section_label>
# ... \endcond blocks.

ENABLED_SECTIONS       =

# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
# initial value of a variable or macro / define can have for it to appear in the
# documentation. If the initializer consists of more lines than specified here
# it will be hidden. Use a value of 0 to hide initializers completely. The
# appearance of the value of individual variables and macros / defines can be
# controlled using \showinitializer or \hideinitializer command in the
# documentation regardless of this setting.
# Minimum value: 0, maximum value: 10000, default value: 30.

MAX_INITIALIZER_LINES  = 30

# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
# the bottom of the documentation of classes and structs. If set to YES, the
# list will mention the files that were used to generate the documentation.
# The default value is: YES.

SHOW_USED_FILES        = YES

# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
# will remove the Files entry from the Quick Index and from the Folder Tree View
# (if specified).
# The default value is: YES.

SHOW_FILES             = YES

# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
# page. This will remove the Namespaces entry from the Quick Index and from the
# Folder Tree View (if specified).
# The default value is: YES.

SHOW_NAMESPACES        = YES

# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from
# the version control system). Doxygen will invoke the program by executing (via
# popen()) the command command input-file, where command is the value of the
# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
# by doxygen. Whatever the program writes to standard output is used as the file
# version. For an example see the documentation.

FILE_VERSION_FILTER    =

# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
# by doxygen. The layout file controls the global structure of the generated
# output files in an output format independent way. To create the layout file
# that represents doxygen's defaults, run doxygen with the -l option. You can
# optionally specify a file name after the option, if omitted DoxygenLayout.xml
# will be used as the name of the layout file.
#
# Note that if you run doxygen from a directory containing a file called
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
# tag is left empty.

LAYOUT_FILE            =  @PROJECT_SOURCE_DIR@/docs/DoxygenLayout.xml

# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
# the reference definitions. This must be a list of .bib files. The .bib
# extension is automatically appended if omitted. This requires the bibtex tool
# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
# For LaTeX the style of the bibliography can be controlled using
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
# search path. See also \cite for info how to create references.

CITE_BIB_FILES         =

#---------------------------------------------------------------------------
# Configuration options related to warning and progress messages
#---------------------------------------------------------------------------

# The QUIET tag can be used to turn on/off the messages that are generated to
# standard output by doxygen. If QUIET is set to YES this implies that the
# messages are off.
# The default value is: NO.

QUIET                  = YES

# The WARNINGS tag can be used to turn on/off the warning messages that are
# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
# this implies that the warnings are on.
#
# Tip: Turn warnings on while writing the documentation.
# The default value is: YES.

WARNINGS               = YES

# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
# will automatically be disabled.
# The default value is: YES.

WARN_IF_UNDOCUMENTED   = YES

# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
# potential errors in the documentation, such as not documenting some parameters
# in a documented function, or documenting parameters that don't exist or using
# markup commands wrongly.
# The default value is: YES.

WARN_IF_DOC_ERROR      = YES

# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
# are documented, but have no documentation for their parameters or return
# value. If set to NO, doxygen will only warn about wrong or incomplete
# parameter documentation, but not about the absence of documentation. If
# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
# The default value is: NO.

WARN_NO_PARAMDOC       = NO

# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
# a warning is encountered.
# The default value is: NO.

WARN_AS_ERROR          = NO

# The WARN_FORMAT tag determines the format of the warning messages that doxygen
# can produce. The string should contain the $file, $line, and $text tags, which
# will be replaced by the file and line number from which the warning originated
# and the warning text. Optionally the format may contain $version, which will
# be replaced by the version of the file (if it could be obtained via
# FILE_VERSION_FILTER).
# The default value is: $file:$line: $text.

WARN_FORMAT            = "$file:$line: $text"

# The WARN_LOGFILE tag can be used to specify a file to which warning and error
# messages should be written. If left blank the output is written to standard
# error (stderr).

WARN_LOGFILE           =

#---------------------------------------------------------------------------
# Configuration options related to the input files
#---------------------------------------------------------------------------

# The INPUT tag is used to specify the files and/or directories that contain
# documented source files. You may enter file names like myfile.cpp or
# directories like /usr/src/myproject. Separate the files or directories with
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING.
# Note: If this tag is empty the current directory is searched.

INPUT                  = @PROJECT_SOURCE_DIR@/coreneuron
INPUT                 += @PROJECT_SOURCE_DIR@/tests

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
# possible encodings.
# The default value is: UTF-8.

INPUT_ENCODING         = UTF-8

# If the value of the INPUT tag contains directories, you can use the
# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
# *.h) to filter out the source-files in the directories.
#
# Note that for custom extensions or not directly supported extensions you also
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# read by doxygen.
#
# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.

FILE_PATTERNS          = *.c \
                         *.cc \
                         *.cxx \
                         *.cpp \
                         *.c++ \
                         *.ipp \
                         *.h \
                         *.hh \
                         *.hxx \
                         *.hpp \
                         *.h++ \
                         *.markdown \
                         *.md \
                         *.mm \
                         *.dox \
                         *.yaml

# The RECURSIVE tag can be used to specify whether or not subdirectories should
# be searched for input files as well.
# The default value is: NO.

RECURSIVE              = YES

# The EXCLUDE tag can be used to specify files and/or directories that should be
# excluded from the INPUT source files. This way you can easily exclude a
# subdirectory from a directory tree whose root is specified with the INPUT tag.
#
# Note that relative paths are relative to the directory from which doxygen is
# run.
#
# NOTE: no EXCLUDE value is assigned after this description, so unless it is set
# elsewhere doxygen's default (an empty list, i.e. nothing excluded) applies.


# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix file system feature) are excluded
# from the input.
# The default value is: NO.

EXCLUDE_SYMLINKS       = NO

# If the value of the INPUT tag contains directories, you can use the
# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
# certain files from those directories.
#
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories for example use the pattern */test/*

EXCLUDE_PATTERNS       =

# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
# (namespaces, classes, functions, etc.) that should be excluded from the
# output. The symbol name can be a fully qualified name, a word, or if the
# wildcard * is used, a substring. Examples: ANamespace, AClass,
# ANamespace::AClass, ANamespace::*Test
#
# Note that the wildcards are matched against symbol names, not file paths; to
# exclude files or directories use EXCLUDE or EXCLUDE_PATTERNS instead.

EXCLUDE_SYMBOLS        =

# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
# command).

EXAMPLE_PATH           =

# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
# *.h) to filter out the source-files in the directories. If left blank all
# files are included.

EXAMPLE_PATTERNS       = *

# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
# searched for input files to be used with the \include or \dontinclude commands
# irrespective of the value of the RECURSIVE tag.
# The default value is: NO.

EXAMPLE_RECURSIVE      = NO

# The IMAGE_PATH tag can be used to specify one or more files or directories
# that contain images that are to be included in the documentation (see the
# \image command).

IMAGE_PATH             =

# The INPUT_FILTER tag can be used to specify a program that doxygen should
# invoke to filter for each input file. Doxygen will invoke the filter program
# by executing (via popen()) the command:
#
# <filter> <input-file>
#
# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
# name of an input file. Doxygen will then use the output that the filter
# program writes to standard output. If FILTER_PATTERNS is specified, this tag
# will be ignored.
#
# Note that the filter must not add or remove lines; it is applied before the
# code is scanned, but not when the output code is generated. If lines are added
# or removed, the anchors will not be placed correctly.
#
# Note that for custom extensions or not directly supported extensions you also
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# properly processed by doxygen.

INPUT_FILTER           =

# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
# basis. Doxygen will compare the file name with each pattern and apply the
# filter if there is a match. The filters are a list of the form: pattern=filter
# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
# patterns match the file name, INPUT_FILTER is applied.
#
# Note that for custom extensions or not directly supported extensions you also
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# properly processed by doxygen.

FILTER_PATTERNS        =

# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
# INPUT_FILTER) will also be used to filter the input files that are used for
# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
# The default value is: NO.

FILTER_SOURCE_FILES    = NO

# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
# it is also possible to disable source filtering for a specific pattern using
# *.ext= (so without naming a filter).
# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.

FILTER_SOURCE_PATTERNS =

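# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
# is part of the input, its contents will be placed on the main page
# (index.html). This can be useful if you have a project on, for instance,
# GitHub and want to reuse the introduction page also for the doxygen output.
#
# NOTE: the README path used below is relative, so it is resolved against the
# directory from which doxygen is run, unlike the other INPUT entries, which are
# anchored to the project source directory.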

INPUT += ../README.md
USE_MDFILE_AS_MAINPAGE = ../README.md

#---------------------------------------------------------------------------
# Configuration options related to source browsing
#---------------------------------------------------------------------------

# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
# generated. Documented entities will be cross-referenced with these sources.
#
# Note: To get rid of all source code in the generated output, make sure that
# also VERBATIM_HEADERS is set to NO.
# The default value is: NO.

SOURCE_BROWSER         = YES

# Setting the INLINE_SOURCES tag to YES will include the body of functions,
# classes and enums directly into the documentation.
# The default value is: NO.

INLINE_SOURCES         = NO

# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
# special comment blocks from generated source code fragments. Normal C, C++ and
# Fortran comments will always remain visible.
# The default value is: YES.

STRIP_CODE_COMMENTS    = NO

# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
# entity all documented functions referencing it will be listed.
# The default value is: NO.

REFERENCED_BY_RELATION = NO

# If the REFERENCES_RELATION tag is set to YES then for each documented function
# all documented entities called/used by that function will be listed.
# The default value is: NO.

REFERENCES_RELATION    = NO

# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
# to YES then the hyperlinks from functions in REFERENCES_RELATION and
# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
# link to the documentation.
# The default value is: YES.

REFERENCES_LINK_SOURCE = YES

# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
# source code will show a tooltip with additional information such as prototype,
# brief description and links to the definition and documentation. Since this
# will make the HTML file larger and loading of large files a bit slower, you
# can opt to disable this feature.
# The default value is: YES.
# This tag requires that the tag SOURCE_BROWSER is set to YES.

SOURCE_TOOLTIPS        = YES

# If the USE_HTAGS tag is set to YES then the references to source code will
# point to the HTML generated by the htags(1) tool instead of doxygen built-in
# source browser. The htags tool is part of GNU's global source tagging system
# (see https://www.gnu.org/software/global/global.html). You will need version
# 4.8.6 or higher.
#
# To use it do the following:
# - Install the latest version of global
# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
# - Make sure the INPUT points to the root of the source tree
# - Run doxygen as normal
#
# Doxygen will invoke htags (and that will in turn invoke gtags), so these
# tools must be available from the command line (i.e. in the search path).
#
# The result: instead of the source browser generated by doxygen, the links to
# source code will now point to the output of htags.
# The default value is: NO.
# This tag requires that the tag SOURCE_BROWSER is set to YES.

USE_HTAGS              = NO

# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
# verbatim copy of the header file for each class for which an include is
# specified. Set to NO to disable this.
# See also: Section \class.
# The default value is: YES.

VERBATIM_HEADERS       = YES

#---------------------------------------------------------------------------
# Configuration options related to the alphabetical class index
#---------------------------------------------------------------------------

# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
# compounds will be generated. Enable this if the project contains a lot of
# classes, structs, unions or interfaces.
# The default value is: YES.

ALPHABETICAL_INDEX     = YES

# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
# which the alphabetical index list will be split.
# Minimum value: 1, maximum value: 20, default value: 5.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.

COLS_IN_ALPHA_INDEX    = 5

# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
# while generating the index headers.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.

IGNORE_PREFIX          =

#---------------------------------------------------------------------------
# Configuration options related to the HTML output
#---------------------------------------------------------------------------

# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
# The default value is: YES.

GENERATE_HTML          = YES

# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: html.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_OUTPUT            = doxygen

# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
# generated HTML page (for example: .htm, .php, .asp).
# The default value is: .html.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_FILE_EXTENSION    = .html

# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
# each generated HTML page. If the tag is left blank doxygen will generate a
# standard header.
#
# To get valid HTML, the header file must include any scripts and style sheets
# that doxygen needs, which depend on the configuration options used (e.g. the
# setting GENERATE_TREEVIEW). It is highly recommended to start with a default
# header using
# doxygen -w html new_header.html new_footer.html new_stylesheet.css
# YourConfigFile
# and then modify the file new_header.html. See also section "Doxygen usage"
# for information on how to generate the default header that doxygen normally
# uses.
# Note: The header is subject to change so you typically have to regenerate the
# default header when upgrading to a newer version of doxygen. For a description
# of the possible markers and block names see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_HEADER            =

# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
# generated HTML page. If the tag is left blank doxygen will generate a standard
# footer. See HTML_HEADER for more information on how to generate a default
# footer and what special commands can be used inside the footer. See also
# section "Doxygen usage" for information on how to generate the default footer
# that doxygen normally uses.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_FOOTER            = @PROJECT_SOURCE_DIR@/docs/footer.html

# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
# sheet that is used by each HTML page. It can be used to fine-tune the look of
# the HTML output. If left blank doxygen will generate a default style sheet.
# See also section "Doxygen usage" for information on how to generate the style
# sheet that doxygen normally uses.
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
# it is more robust and this tag (HTML_STYLESHEET) will in the future become
# obsolete.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_STYLESHEET        =

# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# cascading style sheets that are included after the standard style sheets
# created by doxygen. Using this option one can overrule certain style aspects.
# This is preferred over using HTML_STYLESHEET since it does not replace the
# standard style sheet and is therefore more robust against future updates.
# Doxygen will copy the style sheet files to the output directory.
# Note: The order of the extra style sheet files is of importance (e.g. the last
# style sheet in the list overrules the setting of the previous ones in the
# list). For an example see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_EXTRA_STYLESHEET  =

# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the HTML output directory. Note
# that these files will be copied to the base HTML output directory. Use the
# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
# files will be copied as-is; there are no commands or markers available.
# This tag requires that the tag GENERATE_HTML is set to YES.

# HTML_EXTRA_FILES       =

# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
# this color. Hue is specified as an angle on a colorwheel, see
# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
# purple, and 360 is red again.
# Minimum value: 0, maximum value: 359, default value: 220.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_HUE    = 344

# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
# in the HTML output. For a value of 0 the output will use grayscales only. A
# value of 255 will produce the most vivid colors.
# Minimum value: 0, maximum value: 255, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_SAT    = 100

# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
# luminance component of the colors in the HTML output. Values below 100
# gradually make the output lighter, whereas values above 100 make the output
# darker. The value divided by 100 is the actual gamma applied, so 80 represents
# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
# change the gamma.
# Minimum value: 40, maximum value: 240, default value: 80.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_GAMMA  = 80

# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
# page will contain the date and time when the page was generated. Setting this
# to YES can help to show when doxygen was last run and thus if the
# documentation is up to date.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_TIMESTAMP         = NO

# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
# documentation will contain a main index with vertical navigation menus that
# are dynamically created via Javascript. If disabled, the navigation index will
# consist of multiple levels of tabs that are statically embedded in every HTML
# page. Disable this option to support browsers that do not have Javascript,
# like the Qt help browser.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_DYNAMIC_MENUS     = YES

# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
# documentation will contain sections that can be hidden and shown after the
# page has loaded.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_DYNAMIC_SECTIONS  = NO

# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
# shown in the various tree structured indices initially; the user can expand
# and collapse entries dynamically later on. Doxygen will expand the tree to
# such a level that at most the specified number of entries are visible (unless
# a fully collapsed tree already exceeds this amount). So setting the number of
# entries to 1 will produce a fully collapsed tree by default. 0 is a special
# value representing an infinite number of entries and will result in a fully
# expanded tree by default.
# Minimum value: 0, maximum value: 9999, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_INDEX_NUM_ENTRIES = 100

# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
# environment (see: https://developer.apple.com/xcode/), introduced with OSX
# 10.5 (Leopard). To create a documentation set, doxygen will generate a
# Makefile in the HTML output directory. Running make will produce the docset in
# that directory and running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
# startup. See
# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
# for more information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_DOCSET        = NO

# This tag determines the name of the docset feed. A documentation feed provides
# an umbrella under which multiple documentation sets from a single provider
# (such as a company or product suite) can be grouped.
# The default value is: Doxygen generated docs.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_FEEDNAME        = "Doxygen generated docs"

# This tag specifies a string that should uniquely identify the documentation
# set bundle. This should be a reverse domain-name style string, e.g.
# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_BUNDLE_ID       = org.doxygen.Project

# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
# the documentation publisher. This should be a reverse domain-name style
# string, e.g. com.mycompany.MyDocSet.documentation.
# The default value is: org.doxygen.Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_PUBLISHER_ID    = org.doxygen.Publisher

# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
# The default value is: Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_PUBLISHER_NAME  = Publisher

# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
# Windows.
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
# files are now used as the Windows 98 help format, and will replace the old
# Windows help format (.hlp) on all Windows platforms in the future. Compressed
# HTML files also contain an index, a table of contents, and you can search for
# words in the documentation. The HTML workshop also contains a viewer for
# compressed HTML files.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_HTMLHELP      = NO

# The CHM_FILE tag can be used to specify the file name of the resulting .chm
# file. You can add a path in front of the file if the result should not be
# written to the html output directory.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

CHM_FILE               =

# The HHC_LOCATION tag can be used to specify the location (absolute path
# including file name) of the HTML help compiler (hhc.exe). If non-empty,
# doxygen will try to run the HTML help compiler on the generated index.hhp.
# The file has to be specified with full path.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

HHC_LOCATION           =

# The GENERATE_CHI flag controls whether a separate .chi index file is generated
# (YES) or the index is included in the master .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

GENERATE_CHI           = NO

# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
# and project file content.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

CHM_INDEX_ENCODING     =

# The BINARY_TOC flag controls whether a binary table of contents is generated
# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
# enables the Previous and Next buttons.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

BINARY_TOC             = NO

# The TOC_EXPAND flag can be set to YES to add extra items for group members to
# the table of contents of the HTML help documentation and to the tree view.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

TOC_EXPAND             = NO

# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
# (.qch) of the generated HTML documentation.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_QHP           = NO

# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
# the file name of the resulting .qch file. The path specified is relative to
# the HTML output folder.
# This tag requires that the tag GENERATE_QHP is set to YES.

QCH_FILE               =

# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
# (see: http://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_NAMESPACE          = org.doxygen.Project

# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
# Qt Help Project output. For more information please see Qt Help Project /
# Virtual Folders (see:
# http://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_VIRTUAL_FOLDER     = doc

# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
# Filters (see:
# http://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_CUST_FILTER_NAME   =

# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
# Filters (see:
# http://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_CUST_FILTER_ATTRS  =

# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. For more information please see Qt Help
# Project / Filter Attributes (see:
# http://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_SECT_FILTER_ATTRS  =

# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHG_LOCATION           =

# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
# generated, together with the HTML files, they form an Eclipse help plugin. To
# install this plugin and make it available under the help contents menu in
# Eclipse, the contents of the directory containing the HTML and XML files needs
# to be copied into the plugins directory of eclipse. The name of the directory
# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
# After copying Eclipse needs to be restarted before the help appears.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_ECLIPSEHELP   = NO

# A unique identifier for the Eclipse help plugin. When installing the plugin
# the directory name containing the HTML and XML files should also have this
# name. Each documentation set should have its own identifier.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.

ECLIPSE_DOC_ID         = org.doxygen.Project

# If you want full control over the layout of the generated HTML pages it might
# be necessary to disable the index and replace it with your own. The
# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
# of each HTML page. A value of NO enables the index and the value YES disables
# it. Since the tabs in the index contain the same information as the navigation
# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

DISABLE_INDEX          = NO

# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
# structure should be generated to display hierarchical information. If the tag
# value is set to YES, a side panel will be generated containing a tree-like
# index structure (just like the one that is generated for HTML Help). For this
# to work a browser that supports JavaScript, DHTML, CSS and frames is required
# (i.e. any modern browser). Windows users are probably better off using the
# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
# further fine-tune the look of the index. As an example, the default style
# sheet generated by doxygen has an example that shows how to put an image at
# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
# the same information as the tab index, you could consider setting
# DISABLE_INDEX to YES when enabling this option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_TREEVIEW      = YES

# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
# doxygen will group on one line in the generated HTML documentation.
#
# Note that a value of 0 will completely suppress the enum values from appearing
# in the overview section.
# Minimum value: 0, maximum value: 20, default value: 4.
# This tag requires that the tag GENERATE_HTML is set to YES.

ENUM_VALUES_PER_LINE   = 4

# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
# to set the initial width (in pixels) of the frame in which the tree is shown.
# Minimum value: 0, maximum value: 1500, default value: 250.
# This tag requires that the tag GENERATE_HTML is set to YES.

TREEVIEW_WIDTH         = 250

# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
# external symbols imported via tag files in a separate window.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

EXT_LINKS_IN_WINDOW    = NO

# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
# output directory to force them to be regenerated.
# Minimum value: 8, maximum value: 50, default value: 10.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_FONTSIZE       = 10

# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
# Note that when changing this option you need to delete any form_*.png files in
# the HTML output directory before the changes have effect.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_TRANSPARENT    = YES

# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# https://www.mathjax.org) which uses client side Javascript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want formulas to look prettier in the HTML output. When
# enabled you may also need to install MathJax separately and configure the path
# to it using the MATHJAX_RELPATH option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

USE_MATHJAX            = YES

# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_FORMAT         = HTML-CSS

# When MathJax is enabled you need to specify the location relative to the HTML
# output directory using the MATHJAX_RELPATH option. The destination directory
# should contain the MathJax.js script. For instance, if the mathjax directory
# is located at the same level as the HTML output directory, then
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
# MathJax from https://www.mathjax.org before deployment.
# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/

# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_EXTENSIONS     =

# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_CODEFILE       =

# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
# the HTML output. The underlying search engine uses javascript and DHTML and
# should work on any modern browser. Note that when using HTML help
# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
# there is already a search function so this one should typically be disabled.
# For large projects the javascript based search engine can be slow; enabling
# SERVER_BASED_SEARCH may then provide a better solution. It is possible to
# search using the keyboard; to jump to the search box use <access key> + S
# (what the <access key> is depends on the OS and browser, but it is typically
# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
# key> to jump into the search results window, the results can be navigated
# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
# the search. The filter options can be selected when the cursor is inside the
# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
# to select a filter and <Enter> or <escape> to activate or cancel the filter
# option.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

SEARCHENGINE           = YES

# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
# implemented using a web server instead of a web client using Javascript. There
# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
# setting. When disabled, doxygen will generate a PHP script for searching and
# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
# and searching needs to be provided by external tools. See the section
# "External Indexing and Searching" for details.
# The default value is: NO.
# This tag requires that the tag SEARCHENGINE is set to YES.

SERVER_BASED_SEARCH    = NO

# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
# script for searching. Instead the search results are written to an XML file
# which needs to be processed by an external indexer. Doxygen will invoke an
# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
# search results.
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: https://xapian.org/).
#
# See the section "External Indexing and Searching" for details.
# The default value is: NO.
# This tag requires that the tag SEARCHENGINE is set to YES.

EXTERNAL_SEARCH        = NO

# The SEARCHENGINE_URL should point to a search engine hosted by a web server
# which will return the search results when EXTERNAL_SEARCH is enabled.
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: https://xapian.org/). See the section "External Indexing and
# Searching" for details.
# This tag requires that the tag SEARCHENGINE is set to YES.

SEARCHENGINE_URL       =

# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
# search data is written to a file for indexing by an external tool. With the
# SEARCHDATA_FILE tag the name of this file can be specified.
# The default file is: searchdata.xml.
# This tag requires that the tag SEARCHENGINE is set to YES.

SEARCHDATA_FILE        = searchdata.xml

# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
# projects and redirect the results back to the right project.
# This tag requires that the tag SEARCHENGINE is set to YES.

EXTERNAL_SEARCH_ID     =

# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
# projects other than the one defined by this configuration file, but that are
# all added to the same external search index. Each project needs to have a
# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
# a project to a relative location where the documentation can be found. The
# format is:
# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
# This tag requires that the tag SEARCHENGINE is set to YES.

EXTRA_SEARCH_MAPPINGS  =

#---------------------------------------------------------------------------
# Configuration options related to the LaTeX output
#---------------------------------------------------------------------------

# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
# The default value is: YES.

GENERATE_LATEX         = NO

# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: latex.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_OUTPUT           = latex

# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked.
#
# Note that when USE_PDFLATEX is not enabled the default is latex; when
# USE_PDFLATEX is enabled the default is pdflatex, and if latex is chosen in
# the latter case it is overridden by pdflatex. For specific output languages
# the default may have been set differently; this depends on the implementation
# of the output language.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_CMD_NAME         =

# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
# index for LaTeX.
# Note: This tag is used in the Makefile / make.bat.
# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
# (.tex).
# The default file is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.

MAKEINDEX_CMD_NAME     = makeindex

# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
# generate index for LaTeX. In case there is no backslash (\) as first character
# it will be automatically added in the LaTeX code.
# Note: This tag is used in the generated output file (.tex).
# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
# The default value is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_MAKEINDEX_CMD    = makeindex

# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
# documents. This may be useful for small projects and may help to save some
# trees in general.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

COMPACT_LATEX          = NO

# The PAPER_TYPE tag can be used to set the paper type that is used by the
# printer.
# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
# 14 inches) and executive (7.25 x 10.5 inches).
# The default value is: a4.
# This tag requires that the tag GENERATE_LATEX is set to YES.

PAPER_TYPE             = a4

# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
# that should be included in the LaTeX output. The package can be specified just
# by its name or with the correct syntax as to be used with the LaTeX
# \usepackage command. To get the times font for instance you can specify:
# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
# To use the option intlimits with the amsmath package you can specify:
# EXTRA_PACKAGES=[intlimits]{amsmath}
# If left blank no extra packages will be included.
# This tag requires that the tag GENERATE_LATEX is set to YES.

EXTRA_PACKAGES         =

# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
# generated LaTeX document. The header should contain everything until the first
# chapter. If it is left blank doxygen will generate a standard header. See
# section "Doxygen usage" for information on how to let doxygen write the
# default header to a separate file.
#
# Note: Only use a user-defined header if you know what you are doing! The
# following commands have a special meaning inside the header: $title,
# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
# string, for the replacement values of the other commands the user is referred
# to HTML_HEADER.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_HEADER           =

# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
# generated LaTeX document. The footer should contain everything after the last
# chapter. If it is left blank doxygen will generate a standard footer. See
# LATEX_HEADER for more information on how to generate a default footer and what
# special commands can be used inside the footer.
#
# Note: Only use a user-defined footer if you know what you are doing!
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_FOOTER           =

# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# LaTeX style sheets that are included after the standard style sheets created
# by doxygen. Using this option one can overrule certain style aspects. Doxygen
# will copy the style sheet files to the output directory.
# Note: The order of the extra style sheet files is of importance (e.g. the last
# style sheet in the list overrules the setting of the previous ones in the
# list).
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_EXTRA_STYLESHEET =

# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the LATEX_OUTPUT output
# directory. Note that the files will be copied as-is; there are no commands or
# markers available.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_EXTRA_FILES      =

# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
# contain links (just like the HTML output) instead of page references. This
# makes the output suitable for online browsing using a PDF viewer.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.

PDF_HYPERLINKS         = YES

# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
# the PDF file directly from the LaTeX files. Set this option to YES, to get a
# higher quality PDF documentation.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.

USE_PDFLATEX           = YES

# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
# command to the generated LaTeX files. This will instruct LaTeX to keep running
# if errors occur, instead of asking the user for help. This option is also used
# when generating formulas in HTML.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_BATCHMODE        = NO

# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
# index chapters (such as File Index, Compound Index, etc.) in the output.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_HIDE_INDICES     = NO

# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
# code with syntax highlighting in the LaTeX output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_SOURCE_CODE      = NO

# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
# The default value is: plain.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_BIB_STYLE        = plain

# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
# page will contain the date and time when the page was generated. Setting this
# to NO can help when comparing the output of multiple runs.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_TIMESTAMP        = NO

# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
# path from which the emoji images will be read. If a relative path is entered,
# it will be relative to the LATEX_OUTPUT directory. If left blank the
# LATEX_OUTPUT directory will be used.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_EMOJI_DIRECTORY  =

#---------------------------------------------------------------------------
# Configuration options related to the RTF output
#---------------------------------------------------------------------------

# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
# RTF output is optimized for Word 97 and may not look too pretty with other RTF
# readers/editors.
# The default value is: NO.

GENERATE_RTF           = NO

# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: rtf.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_OUTPUT             = rtf

# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
# documents. This may be useful for small projects and may help to save some
# trees in general.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.

COMPACT_RTF            = NO

# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
# contain hyperlink fields. The RTF file will contain links (just like the HTML
# output) instead of page references. This makes the output suitable for online
# browsing using Word or some other Word compatible readers that support those
# fields.
#
# Note: WordPad (write) and others do not support links.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_HYPERLINKS         = NO

# Load stylesheet definitions from file. Syntax is similar to doxygen's
# configuration file, i.e. a series of assignments. You only have to provide
# replacements, missing definitions are set to their default value.
#
# See also section "Doxygen usage" for information on how to generate the
# default style sheet that doxygen normally uses.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_STYLESHEET_FILE    =

# Set optional variables used in the generation of an RTF document. Syntax is
# similar to doxygen's configuration file. A template extensions file can be
# generated using doxygen -e rtf extensionFile.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_EXTENSIONS_FILE    =

# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
# with syntax highlighting in the RTF output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_SOURCE_CODE        = NO

#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------

# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
# classes and files.
# The default value is: NO.

GENERATE_MAN           = NO

# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it. A directory man3 will be created inside the directory specified by
# MAN_OUTPUT.
# The default directory is: man.
# This tag requires that the tag GENERATE_MAN is set to YES.

MAN_OUTPUT             = man

# The MAN_EXTENSION tag determines the extension that is added to the generated
# man pages. In case the manual section does not start with a number, the number
# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
# optional.
# The default value is: .3.
# This tag requires that the tag GENERATE_MAN is set to YES.

MAN_EXTENSION          = .3

# The MAN_SUBDIR tag determines the name of the directory created within
# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
# MAN_EXTENSION with the initial . removed.
# This tag requires that the tag GENERATE_MAN is set to YES.

MAN_SUBDIR             =

# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
# will generate one additional man file for each entity documented in the real
# man page(s). These additional files only source the real man page, but without
# them the man command would be unable to find the correct page.
# The default value is: NO.
# This tag requires that the tag GENERATE_MAN is set to YES.

MAN_LINKS              = NO

#---------------------------------------------------------------------------
# Configuration options related to the XML output
#---------------------------------------------------------------------------

# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
# captures the structure of the code including all documentation.
# The default value is: NO.

GENERATE_XML           = NO

# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: xml.
# This tag requires that the tag GENERATE_XML is set to YES.

XML_OUTPUT             = xml

# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
# listings (including syntax highlighting and cross-referencing information) to
# the XML output. Note that enabling this will significantly increase the size
# of the XML output.
# The default value is: YES.
# This tag requires that the tag GENERATE_XML is set to YES.

XML_PROGRAMLISTING     = YES

# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
# namespace members in file scope as well, matching the HTML output.
# The default value is: NO.
# This tag requires that the tag GENERATE_XML is set to YES.

XML_NS_MEMB_FILE_SCOPE = NO

#---------------------------------------------------------------------------
# Configuration options related to the DOCBOOK output
#---------------------------------------------------------------------------

# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
# that can be used to generate PDF.
# The default value is: NO.

GENERATE_DOCBOOK       = NO

# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
# front of it.
# The default directory is: docbook.
# This tag requires that the tag GENERATE_DOCBOOK is set to YES.

DOCBOOK_OUTPUT         = docbook

# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
# program listings (including syntax highlighting and cross-referencing
# information) to the DOCBOOK output. Note that enabling this will significantly
# increase the size of the DOCBOOK output.
# The default value is: NO.
# This tag requires that the tag GENERATE_DOCBOOK is set to YES.

DOCBOOK_PROGRAMLISTING = NO

#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------

# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
# the structure of the code including all documentation. Note that this feature
# is still experimental and incomplete at the moment.
# The default value is: NO.

GENERATE_AUTOGEN_DEF   = NO

#---------------------------------------------------------------------------
# Configuration options related to the Perl module output
#---------------------------------------------------------------------------

# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
# file that captures the structure of the code including all documentation.
#
# Note that this feature is still experimental and incomplete at the moment.
# The default value is: NO.

GENERATE_PERLMOD       = NO

# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
# output from the Perl module output.
# The default value is: NO.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.

PERLMOD_LATEX          = NO

# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
# formatted so it can be parsed by a human reader. This is useful if you want to
# understand what is going on. On the other hand, if this tag is set to NO, the
# size of the Perl module output will be much smaller and Perl will parse it
# just the same.
# The default value is: YES.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.

PERLMOD_PRETTY         = YES

# The names of the make variables in the generated doxyrules.make file are
# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
# so different doxyrules.make files included by the same Makefile don't
# overwrite each other's variables.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.

PERLMOD_MAKEVAR_PREFIX =

#---------------------------------------------------------------------------
# Configuration options related to the preprocessor
#---------------------------------------------------------------------------

# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
# C-preprocessor directives found in the sources and include files.
# The default value is: YES.

ENABLE_PREPROCESSING   = YES

# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
# in the source code. If set to NO, only conditional compilation will be
# performed. Macro expansion can be done in a controlled way by setting
# EXPAND_ONLY_PREDEF to YES.
# The default value is: NO.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

MACRO_EXPANSION        = NO

# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
# the macro expansion is limited to the macros specified with the PREDEFINED and
# EXPAND_AS_DEFINED tags.
# The default value is: NO.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

EXPAND_ONLY_PREDEF     = NO

# If the SEARCH_INCLUDES tag is set to YES, the include files in the
# INCLUDE_PATH will be searched if a #include is found.
# The default value is: YES.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

SEARCH_INCLUDES        = YES

# The INCLUDE_PATH tag can be used to specify one or more directories that
# contain include files that are not input files but should be processed by the
# preprocessor.
# This tag requires that the tag SEARCH_INCLUDES is set to YES.

INCLUDE_PATH           =

# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
# patterns (like *.h and *.hpp) to filter out the header-files in the
# directories. If left blank, the patterns specified with FILE_PATTERNS will be
# used.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

INCLUDE_FILE_PATTERNS  =

# The PREDEFINED tag can be used to specify one or more macro names that are
# defined before the preprocessor is started (similar to the -D option of e.g.
# gcc). The argument of the tag is a list of macros of the form: name or
# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
# is assumed. To prevent a macro definition from being undefined via #undef or
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

PREDEFINED             =

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
# macro definition that is found in the sources will be used. Use the PREDEFINED
# tag if you want to use a different macro definition that overrules the
# definition found in the source code.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

EXPAND_AS_DEFINED      =

# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
# remove all references to function-like macros that are alone on a line, have
# an all uppercase name, and do not end with a semicolon. Such function macros
# are typically used for boiler-plate code, and will confuse the parser if not
# removed.
# The default value is: YES.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

SKIP_FUNCTION_MACROS   = YES

#---------------------------------------------------------------------------
# Configuration options related to external references
#---------------------------------------------------------------------------

# The TAGFILES tag can be used to specify one or more tag files. For each tag
# file the location of the external documentation should be added. The format of
# a tag file without this location is as follows:
# TAGFILES = file1 file2 ...
# Adding location for the tag files is done as follows:
# TAGFILES = file1=loc1 "file2 = loc2" ...
# where loc1 and loc2 can be relative or absolute paths or URLs. See the
# section "Linking to external documentation" for more information about the use
# of tag files.
# Note: Each tag file must have a unique name (where the name does NOT include
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.

TAGFILES               =
# TAGFILES              += "cppreference-doxygen-web.tag.xml=http://en.cppreference.com/w/"

# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
# external documentation" for more information about the usage of tag files.

GENERATE_TAGFILE       =

# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
# the class index. If set to NO, only the inherited external classes will be
# listed.
# The default value is: NO.

ALLEXTERNALS           = NO

# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
# in the modules index. If set to NO, only the current project's groups will be
# listed.
# The default value is: YES.

EXTERNAL_GROUPS        = YES

# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
# the related pages index. If set to NO, only the current project's pages will
# be listed.
# The default value is: YES.

EXTERNAL_PAGES         = YES

# The PERL_PATH should be the absolute path and name of the perl script
# interpreter (i.e. the result of 'which perl').
# The default file (with absolute path) is: /usr/bin/perl.

PERL_PATH              = /usr/bin/perl

#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------

# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
# NO turns the diagrams off. Note that this option also works with HAVE_DOT
# disabled, but it is recommended to install and use dot, since it yields more
# powerful graphs.
# The default value is: YES.

CLASS_DIAGRAMS         = YES

# You can define message sequence charts within doxygen comments using the \msc
# command. Doxygen will then run the mscgen tool (see:
# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
# documentation. The MSCGEN_PATH tag allows you to specify the directory where
# the mscgen tool resides. If left empty the tool is assumed to be found in the
# default search path.

MSCGEN_PATH            =

# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
# If left empty dia is assumed to be found in the default search path.

DIA_PATH               =

# If set to YES the inheritance and collaboration graphs will hide inheritance
# and usage relations if the target is undocumented or is not a class.
# The default value is: YES.

HIDE_UNDOC_RELATIONS   = YES

# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
# available from the path. This tool is part of Graphviz (see:
# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
# Bell Labs. The other options in this section have no effect if this option is
# set to NO
# The default value is: NO.

HAVE_DOT               = NO

# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
# to run in parallel. When set to 0 doxygen will base this on the number of
# processors available in the system. You can set it explicitly to a value
# larger than 0 to get control over the balance between CPU load and processing
# speed.
# Minimum value: 0, maximum value: 32, default value: 0.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_NUM_THREADS        = 0

# When you want a differently looking font in the dot files that doxygen
# generates you can specify the font name using DOT_FONTNAME. You need to make
# sure dot is able to find the font, which can be done by putting it in a
# standard location or by setting the DOTFONTPATH environment variable or by
# setting DOT_FONTPATH to the directory containing the font.
# The default value is: Helvetica.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_FONTNAME           = Helvetica

# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
# dot graphs.
# Minimum value: 4, maximum value: 24, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_FONTSIZE           = 10

# By default doxygen will tell dot to use the default font as specified with
# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
# the path where dot can find it using this tag.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_FONTPATH           =

# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
# each documented class showing the direct and indirect inheritance relations.
# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

CLASS_GRAPH            = YES

# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
# graph for each documented class showing the direct and indirect implementation
# dependencies (inheritance, containment, and class references variables) of the
# class with other documented classes.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

COLLABORATION_GRAPH    = YES

# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
# groups, showing the direct groups dependencies.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

GROUP_GRAPHS           = YES

# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
# collaboration diagrams in a style similar to the OMG's Unified Modeling
# Language.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

UML_LOOK               = NO

# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
# class node. If there are many fields or methods and many nodes the graph may
# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
# number of items for each type to make the size more manageable. Set this to 0
# for no limit. Note that the threshold may be exceeded by 50% before the limit
# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
# but if the number exceeds 15, the total amount of fields shown is limited to
# 10.
# Minimum value: 0, maximum value: 100, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.

UML_LIMIT_NUM_FIELDS   = 10

# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
# collaboration graphs will show the relations between templates and their
# instances.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

TEMPLATE_RELATIONS     = NO

# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
# YES then doxygen will generate a graph for each documented file showing the
# direct and indirect include dependencies of the file with other documented
# files.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

INCLUDE_GRAPH          = YES

# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
# set to YES then doxygen will generate a graph for each documented file showing
# the direct and indirect include dependencies of the file with other documented
# files.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

INCLUDED_BY_GRAPH      = YES

# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
# dependency graph for every global function or class method.
#
# Note that enabling this option will significantly increase the time of a run.
# So in most cases it will be better to enable call graphs for selected
# functions only using the \callgraph command. Disabling a call graph can be
# accomplished by means of the command \hidecallgraph.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

CALL_GRAPH             = NO

# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
# dependency graph for every global function or class method.
#
# Note that enabling this option will significantly increase the time of a run.
# So in most cases it will be better to enable caller graphs for selected
# functions only using the \callergraph command. Disabling a caller graph can be
# accomplished by means of the command \hidecallergraph.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

CALLER_GRAPH           = NO

# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
# graphical hierarchy of all classes instead of a textual one.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

GRAPHICAL_HIERARCHY    = YES

# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
# dependencies a directory has on other directories in a graphical way. The
# dependency relations are determined by the #include relations between the
# files in the directories.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

DIRECTORY_GRAPH        = YES

# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
# generated by dot. For an explanation of the image formats see the section
# output formats in the documentation of the dot tool (Graphviz (see:
# http://www.graphviz.org/)).
# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
# to make the SVG files visible in IE 9+ (other browsers do not have this
# requirement).
# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
# png:gdiplus:gdiplus.
# The default value is: png.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_IMAGE_FORMAT       = png

# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
# enable generation of interactive SVG images that allow zooming and panning.
#
# Note that this requires a modern browser other than Internet Explorer. Tested
# and working are Firefox, Chrome, Safari, and Opera.
# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
# the SVG files visible. Older versions of IE do not have SVG support.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

INTERACTIVE_SVG        = NO

# The DOT_PATH tag can be used to specify the path where the dot tool can be
# found. If left blank, it is assumed the dot tool can be found in the path.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_PATH               =

# The DOTFILE_DIRS tag can be used to specify one or more directories that
# contain dot files that are included in the documentation (see the \dotfile
# command).
# This tag requires that the tag HAVE_DOT is set to YES.

DOTFILE_DIRS           =

# The MSCFILE_DIRS tag can be used to specify one or more directories that
# contain msc files that are included in the documentation (see the \mscfile
# command).

MSCFILE_DIRS           =

# The DIAFILE_DIRS tag can be used to specify one or more directories that
# contain dia files that are included in the documentation (see the \diafile
# command).

DIAFILE_DIRS           =

# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
# path where java can find the plantuml.jar file. If left blank, it is assumed
# PlantUML is not used or called during a preprocessing step. Doxygen will
# generate a warning when it encounters a \startuml command in this case and
# will not generate output for the diagram.

PLANTUML_JAR_PATH      =

# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
# configuration file for plantuml.

PLANTUML_CFG_FILE      =

# When using plantuml, the specified paths are searched for files specified by
# the !include statement in a plantuml block.

PLANTUML_INCLUDE_PATH  =

# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
# that will be shown in the graph. If the number of nodes in a graph becomes
# larger than this value, doxygen will truncate the graph, which is visualized
# by representing a node as a red box. Note that if the number of direct
# children of the root node in a graph is already larger than
# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
# Minimum value: 0, maximum value: 10000, default value: 50.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_GRAPH_MAX_NODES    = 50

# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
# generated by dot. A depth value of 3 means that only nodes reachable from the
# root by following a path via at most 3 edges will be shown. Nodes that lay
# further from the root node will be omitted. Note that setting this option to 1
# or 2 may greatly reduce the computation time needed for large code bases. Also
# note that the size of a graph can be further restricted by
# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
# Minimum value: 0, maximum value: 1000, default value: 0.
# This tag requires that the tag HAVE_DOT is set to YES.

MAX_DOT_GRAPH_DEPTH    = 0

# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
# background. This is disabled by default, because dot on Windows does not seem
# to support this out of the box.
#
# Warning: Depending on the platform used, enabling this option may lead to
# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
# read).
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_TRANSPARENT        = NO

# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10) support
# this, this feature is disabled by default.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_MULTI_TARGETS      = NO

# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
# explaining the meaning of the various boxes and arrows in the dot generated
# graphs.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

GENERATE_LEGEND        = YES

# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
# files that are used to generate the various graphs.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_CLEANUP            = YES


================================================
FILE: docs/DoxygenLayout.xml
================================================
<doxygenlayout version="1.0">
  <!-- Generated by doxygen 1.8.15 -->
  <!-- adapted to doxygen 1.8.13 (Katta) -->
  <!-- Navigation index tabs for HTML output -->
  <navindex>
    <tab type="mainpage" visible="yes" title="Overview"/>
    <!-- <tab type="pages" visible="yes" title="Tutorials" intro=""/> -->
    <tab type="modules" visible="yes" title="Components" intro=""/>
    <tab type="namespaces" visible="yes" title="">
      <tab type="namespacelist" visible="yes" title="" intro=""/>
      <tab type="namespacemembers" visible="yes" title="" intro=""/>
    </tab>
    <tab type="classes" visible="yes" title="">
      <tab type="classlist" visible="yes" title="" intro=""/>
      <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
      <tab type="hierarchy" visible="yes" title="" intro=""/>
      <tab type="classmembers" visible="yes" title="" intro=""/>
    </tab>
    <tab type="files" visible="yes" title="">
      <tab type="filelist" visible="yes" title="" intro=""/>
      <tab type="globals" visible="yes" title="" intro=""/>
    </tab>
    <tab type="examples" visible="yes" title="" intro=""/>
  </navindex>

  <!-- Layout definition for a class page -->
  <class>
    <briefdescription visible="yes"/>
    <detaileddescription title=""/>
    <includes visible="$SHOW_INCLUDE_FILES"/>
    <inheritancegraph visible="$CLASS_GRAPH"/>
    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
    <memberdecl>
      <nestedclasses visible="yes" title=""/>
      <publictypes title=""/>
      <services title=""/>
      <interfaces title=""/>
      <publicslots title=""/>
      <signals title=""/>
      <publicmethods title=""/>
      <publicstaticmethods title=""/>
      <publicattributes title=""/>
      <publicstaticattributes title=""/>
      <protectedtypes title=""/>
      <protectedslots title=""/>
      <protectedmethods title=""/>
      <protectedstaticmethods title=""/>
      <protectedattributes title=""/>
      <protectedstaticattributes title=""/>
      <packagetypes title=""/>
      <packagemethods title=""/>
      <packagestaticmethods title=""/>
      <packageattributes title=""/>
      <packagestaticattributes title=""/>
      <properties title=""/>
      <events title=""/>
      <privatetypes title=""/>
      <privateslots title=""/>
      <privatemethods title=""/>
      <privatestaticmethods title=""/>
      <privateattributes title=""/>
      <privatestaticattributes title=""/>
      <friends title=""/>
      <related title="" subtitle=""/>
      <membergroups visible="yes"/>
    </memberdecl>
    <memberdef>
      <inlineclasses title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <services title=""/>
      <interfaces title=""/>
      <constructors title=""/>
      <functions title=""/>
      <related title=""/>
      <variables title=""/>
      <properties title=""/>
      <events title=""/>
    </memberdef>
    <allmemberslink visible="yes"/>
    <usedfiles visible="$SHOW_USED_FILES"/>
    <authorsection visible="yes"/>
  </class>

  <!-- Layout definition for a namespace page -->
  <namespace>
    <briefdescription visible="yes"/>
    <detaileddescription title=""/>
    <memberdecl>
      <nestednamespaces visible="yes" title=""/>
      <constantgroups visible="yes" title=""/>
      <classes visible="yes" title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <functions title=""/>
      <variables title=""/>
      <membergroups visible="yes"/>
    </memberdecl>
    <memberdef>
      <inlineclasses title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <functions title=""/>
      <variables title=""/>
    </memberdef>
    <authorsection visible="yes"/>
  </namespace>

  <!-- Layout definition for a file page -->
  <file>
    <briefdescription visible="yes"/>
    <detaileddescription title=""/>
    <includes visible="$SHOW_INCLUDE_FILES"/>
    <includegraph visible="$INCLUDE_GRAPH"/>
    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
    <sourcelink visible="yes"/>
    <memberdecl>
      <classes visible="yes" title=""/>
      <namespaces visible="yes" title=""/>
      <constantgroups visible="yes" title=""/>
      <defines title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <functions title=""/>
      <variables title=""/>
      <membergroups visible="yes"/>
    </memberdecl>
    <memberdef>
      <inlineclasses title=""/>
      <defines title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <functions title=""/>
      <variables title=""/>
    </memberdef>
    <authorsection/>
  </file>

  <!-- Layout definition for a group page -->
  <group>
    <briefdescription visible="yes"/>
    <detaileddescription title=""/>
    <groupgraph visible="$GROUP_GRAPHS"/>
    <memberdecl>
      <nestedgroups visible="yes" title=""/>
      <dirs visible="yes" title=""/>
      <files visible="yes" title=""/>
      <namespaces visible="yes" title=""/>
      <classes visible="yes" title=""/>
      <defines title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <enumvalues title=""/>
      <functions title=""/>
      <variables title=""/>
      <signals title=""/>
      <publicslots title=""/>
      <protectedslots title=""/>
      <privateslots title=""/>
      <events title=""/>
      <properties title=""/>
      <friends title=""/>
      <membergroups visible="yes"/>
    </memberdecl>
    <memberdef>
      <pagedocs/>
      <inlineclasses title=""/>
      <defines title=""/>
      <typedefs title=""/>
      <enums title=""/>
      <enumvalues title=""/>
      <functions title=""/>
      <variables title=""/>
      <signals title=""/>
      <publicslots title=""/>
      <protectedslots title=""/>
      <privateslots title=""/>
      <events title=""/>
      <properties title=""/>
      <friends title=""/>
    </memberdef>
    <authorsection visible="yes"/>
  </group>

  <!-- Layout definition for a directory page -->
  <directory>
    <briefdescription visible="yes"/>
    <detaileddescription title=""/>
    <directorygraph visible="yes"/>
    <memberdecl>
      <dirs visible="yes"/>
      <files visible="yes"/>
    </memberdecl>
  </directory>
</doxygenlayout>


================================================
FILE: docs/README.md
================================================
## CoreNEURON Documentation

### Local build

Using a `virtualenv` is recommended, for example:

```
pip3 install virtualenv
python3 -m virtualenv venv
source venv/bin/activate
```

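In order to build the documentation locally, you need to pip install the [docs_requirements](docs_requirements.txt) from the repository root (add `--user` only if you are *not* inside a virtualenv, since pip rejects `--user` installs inside one):
```
pip3 install -r docs/docs_requirements.txt --upgrade
```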

Then in your CMake build folder:
```
make docs
```  
That will build everything in the `build/docs` folder and you can then open `index.html` locally.

When working locally on the documentation, be aware of the following targets, which speed up the build process:

* `doxygen` - build the API documentation only
* `sphinx` - build Sphinx documentation


================================================
FILE: docs/_static/custom.css
================================================
/* Cap the width of elements with class "wy-nav-content" at 1000px.
   NOTE(review): presumably the main content pane of sphinx_rtd_theme - confirm. */
.wy-nav-content {
    max-width: 1000px;
    margin-right: auto;
}

/* Make the element with id "notebook-container" inherit its parent's width.
   NOTE(review): presumably targets rendered notebooks - confirm it is still needed. */
#notebook-container {
    width: inherit;
}


================================================
FILE: docs/conda_environment.yml
================================================
# Conda environment listing the non-Python tools (bison, cmake, doxygen),
# resolved from the conda-forge and defaults channels.
# NOTE(review): presumably consumed by the Read the Docs build - confirm
# against .readthedocs.yml.
name: base
channels:
  - conda-forge
  - defaults
dependencies:
  - bison
  - cmake
  - doxygen


================================================
FILE: docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import sys

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'CoreNEURON'
copyright = 'Duke, Yale, and the BlueBrain Project -- Copyright 1984-2020'
author = 'Michael Hines and the BlueBrain Project'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.autosectionlabel',
    'recommonmark',
    'sphinx.ext.mathjax'
]

# Map file extensions to the parser used for them; Markdown sources are
# handled by the 'recommonmark' extension listed above.
source_suffix = {
    '.rst': 'restructuredtext',
    '.txt': 'markdown',
    '.md': 'markdown',
}

# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'python/venv']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Name of the root document (the one holding the top-level toctree).
master_doc = 'index'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Extra style sheets, looked up inside html_static_path.
html_css_files = [
    'custom.css',
]

# NOTE: this option belongs to the 'nbsphinx' extension, which is not listed
# in `extensions` above, so it currently has no effect.
nbsphinx_allow_errors = True


# -- Read the Docs integration -----------------------------------------------

# When the READTHEDOCS environment variable is set, configure the project with
# CMake in a scratch BUILD directory (relative to the current working
# directory), build the `doxygen` target there, and ship the result with the
# HTML output through html_extra_path.
if os.environ.get("READTHEDOCS"):
    doxygen_status = os.system("rm -rf BUILD && mkdir BUILD && cd BUILD && cmake -DCORENRN_ENABLE_MPI=OFF ../.. && make doxygen")
    if doxygen_status == 0:
        html_extra_path = ['BUILD/docs']
    else:
        # Do not point Sphinx at a missing or half-built directory; report the
        # failure instead of silently ignoring it.
        print("WARNING: doxygen build failed (status {}); C++ API documentation "
              "will not be included".format(doxygen_status), file=sys.stderr)


================================================
FILE: docs/docs_requirements.txt
================================================
sphinx
sphinx_rtd_theme
recommonmark

================================================
FILE: docs/doxygen.rst
================================================
C++ API
===========

Link to doxygen `C++ API`_ 

.. _C++ API: doxygen/index.html


================================================
FILE: docs/footer.html
================================================
<!-- HTML footer for doxygen 1.8.15-->
<!-- start footer part -->
<!-- The nav-path section is kept when the tree view is enabled; the
     hr/address section is kept when it is disabled. -->
<!--BEGIN GENERATE_TREEVIEW-->
<div id="nav-path" class="navpath">
  <ul>
    $navpath
  </ul>
</div>
<!--END GENERATE_TREEVIEW-->
<!--BEGIN !GENERATE_TREEVIEW-->
<hr class="footer"/>
<address class="footer">
    <small>
    </small>
</address>
<!--END !GENERATE_TREEVIEW-->
</body>
</html>


================================================
FILE: docs/index.rst
================================================
Welcome to CoreNEURON's documentation!
======================================

.. toctree::
   :maxdepth: 2
   :caption: User documentation:

   userdoc/BinaryFormat/BinaryFormat.md
   userdoc/MemoryManagement/bbcorepointer.md

.. toctree::
   :maxdepth: 2
   :caption: Developer documentation:

   doxygen

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


================================================
FILE: docs/userdoc/BinaryFormat/BinaryFormat.md
================================================
## CoreNEURON Input Binary File Format

NEURON is used to build an in-memory model of the network. The in-memory representation of the model is then dumped to binary files, which are read by CoreNEURON. The abstract structure of these binary files is shown below:

![Binary File Format](binary_file_format.jpg)

> Note: additional datasets are being added for additional functionality (e.g. Gap Junctions). This documentation / format will be updated in the future.


================================================
FILE: docs/userdoc/MemoryManagement/bbcorepointer.md
================================================

## Transferring dynamically allocated data between NEURON and CoreNEURON


User-allocated data can be managed in NMODL using the `POINTER` type. It allows the
programmer to reference data that has been allocated in HOC or in VERBATIM blocks. This
allows for more advanced data-structures that are not natively supported in NMODL.

Since NEURON itself has no knowledge of the layout and size of this data it cannot
transfer `POINTER` data automatically to CoreNEURON. Furthermore, in many cases there
is no need to transfer the data between the two instances. In some cases, however, the
programmer would like to transfer certain user-defined data into CoreNEURON. The most
prominent example is the random123 RNG stream parameters used in synapse mechanisms. To
support this use-case the `BBCOREPOINTER` type was introduced. Variables that are declared as
`BBCOREPOINTER` behave exactly the same as `POINTER` but are additionally taken into account
when NEURON is serializing mechanism data (for file writing or direct-memory transfer).
For NEURON to be able to write (and indeed CoreNEURON to be able to read) `BBCOREPOINTER`
data, the programmer has to additionally provide two C functions that are called as part
of the serialization/deserialization.

```
static void bbcore_write(double* x, int* d, int* x_offset, int* d_offset, _threadargsproto_);

static void bbcore_read(double* x, int* d, int* x_offset, int* d_offset, _threadargsproto_);
```

The implementation of `bbcore_write` and `bbcore_read` determines the serialization and
deserialization of the per-instance mechanism data referenced through the various
`BBCOREPOINTER`s.

NEURON will call `bbcore_write` twice per mechanism instance. In a first sweep, the call is used to
determine the required memory to be allocated on the serialization arrays. In the second sweep the
call is used to fill in the data per mechanism instance.

Both functions take the following arguments:

* `x`: A `double` type array that will be allocated by NEURON to fill with real-valued data. In the
  first call, `x` is NULL as it has not been allocated yet.
* `d`: An `int` type array that will be allocated by NEURON to fill with integer-valued data. In the
  first call, `d` is NULL as it has not been allocated yet.
* `x_offset`: The offset in `x` at which the mechanism instance should write its real-valued
  `BBCOREPOINTER` data. In the first call this is an output argument that is expected to be updated
  by the per-instance size to be allocated.
* `d_offset`: The offset in `d` at which the mechanism instance should write its integer-valued
  `BBCOREPOINTER` data. In the first call this is an output argument that is expected to be updated
  by the per-instance size to be allocated.
* `_threadargsproto_`: a macro placeholder for NEURON/CoreNEURON data-structure parameters. They
  are typically only used through generated defines and not by the programmer. The macro is defined
  as follows:

```
#define _threadargsproto_                                                                         \
    int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \
    double _v
```

Putting all of this together, the following is a minimal MOD using BBCOREPOINTER:

```
TITLE A BBCOREPOINTER Example 

NEURON {
    BBCOREPOINTER my_data
}

ASSIGNED {
    my_data
}

: Do something interesting with my_data ...

VERBATIM
static void bbcore_write(double* x, int* d, int* x_offset, int* d_offset, _threadargsproto_) {
    if (x) {
        double* x_i = x + *x_offset;
        x_i[0] = _p_my_data[0];
        x_i[1] = _p_my_data[1];
    }
    *x_offset += 2; // reserve 2 doubles on serialization buffer x
}

static void bbcore_read(double* x, int* d, int* x_offset, int* d_offset, _threadargsproto_) {
    assert(!_p_my_data);
    double* x_i = x + *x_offset;
    // my_data needs to be allocated somehow
    _p_my_data = (double*)malloc(sizeof(double)*2); 
    _p_my_data[0] = x_i[0];
    _p_my_data[1] = x_i[1];
    *x_offset += 2;
}
ENDVERBATIM
```


================================================
FILE: extra/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================

# =============================================================================
# Copy first into build directory as it will be used for special-core
# =============================================================================
# Generate the makefile and the wrapper script from their templates. @ONLY restricts substitution
# to @VAR@ references, so ${...} and $(...) expressions in the templates are left untouched.
configure_file(nrnivmodl_core_makefile.in
               ${CMAKE_BINARY_DIR}/share/coreneuron/nrnivmodl_core_makefile @ONLY)
configure_file(nrnivmodl-core.in ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core @ONLY)
# The nrnivmodl-core target depends on NMODL_TARGET_TO_DEPEND having been built and on the two
# configured files above (nrnivmodl-core and nrnivmodl_core_makefile). Note that this does not
# imply that a change in one of those files causes the prebuilt mod files to be rebuilt.
add_custom_target(
  nrnivmodl-core ALL
  DEPENDS ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core
          ${CMAKE_BINARY_DIR}/share/coreneuron/nrnivmodl_core_makefile ${NMODL_TARGET_TO_DEPEND})

# =============================================================================
# Install for end users
# =============================================================================
install(FILES ${CMAKE_BINARY_DIR}/share/coreneuron/nrnivmodl_core_makefile
        DESTINATION share/coreneuron)
install(PROGRAMS ${CMAKE_BINARY_DIR}/bin/nrnivmodl-core DESTINATION bin)


================================================
FILE: extra/instrumentation.tau
================================================
BEGIN_INCLUDE_LIST
 double nrnmpi_dbl_allreduce(double, int)
 int coreneuron::main(int, char **, char **)
 int coreneuron::nrnmpi_bgp_conserve(int, int)
 int coreneuron::nrnmpi_bgp_single_advance(NRNMPI_Spike *)
 int coreneuron::nrnmpi_spike_exchange(int*, NRNMPI_Spike*)
 int main(int, char **, char **)
 size_t nrnbbcore_write()
 void coreneuron::*nrn_fixed_step_group_thread(coreneuron::NrnThread *)
 void coreneuron::*nrn_fixed_step_lastpart(coreneuron::NrnThread *)
 void coreneuron::*nrn_fixed_step_thread(coreneuron::NrnThread *)
 void coreneuron::*nrn_ms_bksub(coreneuron::NrnThread *)
 void coreneuron::*nrn_ms_bksub_through_triang(coreneuron::NrnThread *)
 void coreneuron::*nrn_ms_reduce_solve(coreneuron::NrnThread *)
 void coreneuron::*nrn_ms_treeset_through_triang(coreneuron::NrnThread *)
 void coreneuron::*setup_tree_matrix(coreneuron::NrnThread *)
 void coreneuron::*setup_tree_matrix_minimal(coreneuron::NrnThread *)
 void coreneuron::BBS::netpar_solve(double)
 void coreneuron::BBS_netpar_solve(double)
 void coreneuron::NetParEvent::deliver(double, NetCvode *, coreneuron::NrnThread *)
 void coreneuron::NetParEvent::send(double, NetCvode *, coreneuron::NrnThread *)
 void coreneuron::_nrn_cur#(coreneuron::NrnThread *, coreneuron::Memb_list *, int)
 void coreneuron::_nrn_jacob#(coreneuron::NrnThread *, coreneuron::Memb_list *, int)
 void coreneuron::_nrn_state#(coreneuron::NrnThread *, coreneuron::Memb_list *, int)
 void coreneuron::all_wait_for_spike_exchange()
 void coreneuron::bksub(coreneuron::NrnThread *)
 void coreneuron::deliver_net_events(coreneuron::NrnThread *)
 void coreneuron::determine_inputpresyn()
 void coreneuron::finitialize(void)
 void coreneuron::ncs2nrn_integrate(double)
 void coreneuron::nonvint(coreneuron::NrnThread *)
 void coreneuron::nrn2ncs_outputevent(int, double)
 void coreneuron::nrn_cap_jacob(coreneuron::NrnThread *, Memb_list *)
 void coreneuron::nrn_cleanup_presyn(PreSyn *)
 void coreneuron::nrn_deliver_events(coreneuron::NrnThread *)
 void coreneuron::nrn_finitialize(int, double)
 void coreneuron::nrn_fixed_step_group(int)
 void coreneuron::nrn_fixed_step_group_minimal(int)
 void coreneuron::nrn_fixed_single_steps_minimal(int, double)
 void coreneuron::nrn_flush_reports(double)
 void coreneuron::nrn_lhs(coreneuron::NrnThread *)
 void coreneuron::nrn_multithread_job(void *(*)(coreneuron::NrnThread *))
 void coreneuron::nrn_promote()
 void coreneuron::nrn_rhs(coreneuron::NrnThread *)
 void coreneuron::nrn_setup(const char *, const char *, int, int)
 void coreneuron::nrn_solve(coreneuron::NrnThread *)
 void coreneuron::nrn_solve_minimal(coreneuron::NrnThread *)
 void coreneuron::nrn_spike_exchange(coreneuron::NrnThread *)
 void coreneuron::nrn_spike_exchange_init()
 void coreneuron::nrnmpi_barrier()
 void coreneuron::nrnmpi_bgp_multisend(NRNMPI_Spike *, int, int *)
 void coreneuron::nrnmpi_int_gather(int *, int *, int, int)
 void coreneuron::nrnmpi_int_gatherv(int *, int, int *, int *, int *, int)
 void coreneuron::nrnmpi_postrecv_doubles(double *, int, int, int, void **)
 void coreneuron::nrnmpi_send_doubles(double *, int, int, int)
 void coreneuron::nrnmpi_spike_initialize()
 void coreneuron::nrnmpi_wait(void **)
 void coreneuron::output_spikes(const char *)
 void coreneuron::output_spikes_parallel(const char *)
 void coreneuron::read_phase1(data_reader &, coreneuron::NrnThread &)
 void coreneuron::read_phase2(data_reader &, coreneuron::NrnThread &)
 void coreneuron::setup_report_engine(double, double)
 void coreneuron::solve_interleaved1(int)
 void coreneuron::triang(coreneuron::NrnThread *)
 void coreneuron::triang_interleaved(coreneuron::NrnThread *, int, int, int, int *, int *)
 void coreneuron::update(coreneuron::NrnThread *)
 void coreneuron::write_checkpoint(coreneuron::NrnThread *, int, const char *, bool)
 void coreneuron::write_checkpoint(coreneuron::NrnThread *, int, const char*, bool)
 void coreneuron::write_nrnthread(const char *, coreneuron::NrnThread &, nrncore_CellGroup &)
 void coreneuron::write_nrnthread_task(const char *, nrncore_CellGroup *)
END_INCLUDE_LIST


================================================
FILE: extra/nrnivmodl-core.in
================================================
#!/bin/bash

# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================

# abort on the first failing command
set -e

# TODO : mod2c_core can be linked with the (HPE-)MPI library,
# and running it under a slurm allocation results in a
# runtime error. For now, unset the PMI_RANK variable,
# which is sufficient to avoid the issue with HPE-MPI+SLURM.
unset PMI_RANK

# name of the script
APP_NAME="$(basename "$0")"

# parent directory of the directory containing this script (not yet normalised)
PARENT_DIR="$(dirname "$BASH_SOURCE")/.."

# prefer perl exe set by neuron wrappers in case of wheel
PERL_EXE="${CORENRN_PERLEXE:-@PERL_EXECUTABLE@}"
# in case of mac installer, wrapper is not used and hence
# check if the binary exists. otherwise, just rely on perl being
# in default $PATH
if [ ! -f "${PERL_EXE}" ]; then PERL_EXE="$(which perl)"; fi

# absolute path of PARENT_DIR, resolved through perl's Cwd::abs_path
ROOT_DIR="$("${PERL_EXE}" -e "use Cwd 'abs_path'; print abs_path('$PARENT_DIR')")"

# default arguments : number of parallel builds, default mod file path, and the
# library type / NRN_PRCELLSTATE values substituted when this script was configured
PARALLEL_BUILDS=4
params_MODS_PATH="."
params_BUILD_TYPE="@COMPILE_LIBRARY_TYPE@"
params_NRN_PRCELLSTATE="@CORENRN_NRN_PRCELLSTATE@"

# names of the options forwarded to make: each NAME listed here is passed as NAME=<value>
# when a shell variable params_NAME is set. make sure to rename these if options are changed.
MAKE_OPTIONS="MECHLIB_SUFFIX MOD2CPP_BINARY MOD2CPP_RUNTIME_FLAGS DESTDIR INCFLAGS LINKFLAGS MODS_PATH VERBOSE BUILD_TYPE NRN_PRCELLSTATE"

# parse CLI args; every option except -V and -h takes an argument
while getopts "n:m:a:d:i:l:Vp:r:b:h" OPT; do
    case "$OPT" in
    n)
        # suffix for mechanism library
        params_MECHLIB_SUFFIX="$OPTARG";;
    m)
        # nmodl or mod2c binary to use
        params_MOD2CPP_BINARY="$OPTARG";;
    a)
        # additional nmodl flags to be used
        params_MOD2CPP_RUNTIME_FLAGS="$OPTARG";;
    d)
        # destination install directory
        params_DESTDIR="$OPTARG";;
    i)
        # extra include flags
        params_INCFLAGS="$OPTARG";;
    l)
        # extra link flags
        params_LINKFLAGS="$OPTARG";;
    V)
        # make with verbose
        params_VERBOSE=1;;
    p)
        # option for parallel build (with -j)
        PARALLEL_BUILDS="$OPTARG";;
    b)
        # type of mechanism library to build (STATIC or SHARED)
        params_BUILD_TYPE="$OPTARG";;
    r)
        # enable NRN_PRCELLSTATE mechanism
        params_NRN_PRCELLSTATE="$OPTARG";;
    h)
        # print usage and exit successfully
        echo "$APP_NAME [options, ...] [mods_path]"
        echo "Options:"
        echo "  -n <name>                 The model name, used as a suffix in the shared library"
        echo "  -m <nmodl_bin>            NMODL/mod2c code generation compiler path"
        echo "  -a <nmodl_runtime_flags>  Runtime flags for NMODL/mod2c"
        echo "  -i <incl_flags>           Definitions passed to the compiler, typically '-I dir..'"
        echo "  -l <link_flags>           Definitions passed to the linker, typically '-Lx -lylib..'"
        echo "  -d <dest_dir>             Install to dest_dir. Default: Off."
        echo "  -r <0|1>                  Enable NRN_PRCELLSTATE mechanism. Default: @CORENRN_NRN_PRCELLSTATE@."
        echo "  -V                        Verbose: show commands executed by make"
        echo "  -p <n_procs>              Number of parallel builds (Default: $PARALLEL_BUILDS)"
        echo "  -b <STATIC|SHARED>        libcorenrnmech library type"
        exit 0;;
    ?)
        # getopts sets OPT to '?' for an unknown option or a missing option argument
        exit 1;;
    esac
done

# drop the options already consumed by getopts, leaving only positional arguments
shift $(($OPTIND - 1))

# only one mod files directory is supported in neuron and coreneuron
if [ $# -gt 1 ]; then
    echo "[ERROR] $APP_NAME expects at most one mod dir. See syntax: '$APP_NAME -h' "
    exit 1
fi

# if given, the mods dir is the single remaining positional argument
if [ $# -eq 1 ]; then
    params_MODS_PATH="$1"
fi

# with nullglob a pattern that matches nothing expands to nothing (instead of itself),
# so the arrays built from globs below are empty when there are no matching files
shopt -s nullglob
# warn if no mod files provided
if [ -d "$params_MODS_PATH" ]; then
    files=( "$params_MODS_PATH"/*.mod )
    # ${#files[@]} is the number of elements; ${#files} would be the string length of
    # the first element only
    if [ ${#files[@]} -eq 0 ]; then
        echo "WARNING: No mod files found in '$(realpath "${params_MODS_PATH}")', compiling default ones only!"
    fi
else
    echo "FATAL: Invalid mods directory: '$params_MODS_PATH'"
    exit 1
fi

# temporary directory (relative to the current working directory) where mod files will be copied
temp_mod_dir="@CMAKE_HOST_SYSTEM_PROCESSOR@/corenrn/mod2c"
mkdir -p "$temp_mod_dir"

# copy mod files with include files. note that ${ROOT_DIR}/share/modfile
# has inbuilt mod files and user provided mod files are in $params_MODS_PATH.
# The user directory is processed second, so a user file replaces an inbuilt
# file that has the same name.
# Errors inside the copy loop are tolerated: "exit on error" is switched off
# here and restored after the loop.
set +e
for mod_dir in "${ROOT_DIR}/share/modfile" "$params_MODS_PATH" ;
do
    # copy mod files and include files
    files=( "$mod_dir/"*.mod "$mod_dir/"*.inc "$mod_dir/"*.h* )
    for f in "${files[@]}";
    do
        # copy a file only if it differs from (or is missing in) the target
        # directory, so unchanged files keep their timestamp (to avoid rebuild)
        target_file_path="$temp_mod_dir/$(basename "$f")"
        if ! diff -q "$f" "$target_file_path" &>/dev/null;  then
            cp "$f" "$target_file_path"
        fi
    done
done
set -e

# use new mod files directory for compilation
params_MODS_PATH="$temp_mod_dir"

# build params to make command: every option NAME listed in MAKE_OPTIONS whose
# params_NAME variable is set (even if empty) is forwarded as NAME=<value>.
# VERBOSE is part of MAKE_OPTIONS, so -V is forwarded here as VERBOSE=1 and
# needs no separate handling.
make_params=("ROOT=${ROOT_DIR}")
for param in $MAKE_OPTIONS; do
    var="params_${param}"
    # ${!var+x} expands to "x" only when the variable named by $var is set
    if [ "${!var+x}" ]; then
        make_params+=("$param=${!var}")
    fi
done

# if -d (deploy) provided, call "make install"
if [ "$params_DESTDIR" ]; then
    make_params+=("install")
fi

# run makefile
echo "[INFO] Running: make -j$PARALLEL_BUILDS -f ${ROOT_DIR}/share/coreneuron/nrnivmodl_core_makefile ${make_params[*]}"
make -j$PARALLEL_BUILDS -f "${ROOT_DIR}/share/coreneuron/nrnivmodl_core_makefile" "${make_params[@]}"
echo "[INFO] MOD files built successfully for CoreNEURON"


================================================
FILE: extra/nrnivmodl_core_makefile.in
================================================
# This Makefile has the rules necessary for making the custom version of
# CoreNEURON executable called "special-core" from the provided mod files.
# Mod files are looked up in the MODS_PATH directory.

# Current system OS
OS_NAME := $(shell uname)

# "," separates arguments of make functions, so a literal comma inside a
# function call has to be supplied through a variable
COMMA_OP =,

# Default variables for various targets
MECHLIB_SUFFIX =
MODS_PATH = .
OUTPUT_DIR = @CMAKE_HOST_SYSTEM_PROCESSOR@
DESTDIR =
TARGET_LIB_TYPE = $(BUILD_TYPE)

# required for OSX to execute nrnivmodl-core. xcrun only exists on macOS, so
# do not try to run it (and export an empty SDKROOT) on other systems.
ifeq ($(OS_NAME),Darwin)
  ifeq ($(origin SDKROOT), undefined)
    export SDKROOT := $(shell xcrun --sdk macosx --show-sdk-path)
  endif
endif

# CoreNEURON installation directories
CORENRN_BIN_DIR := $(ROOT)/bin
CORENRN_LIB_DIR := $(ROOT)/lib
CORENRN_INC_DIR := $(ROOT)/include
CORENRN_SHARE_CORENRN_DIR:= $(ROOT)/share/coreneuron
CORENRN_SHARE_MOD2CPP_DIR := $(ROOT)/share/mod2c

# name of the CoreNEURON binary
SPECIAL_EXE  = $(OUTPUT_DIR)/special-core

# Directory where cpp files are generated for each mod file
MOD_TO_CPP_DIR = $(OUTPUT_DIR)/corenrn/mod2c

# Directory where cpp files are compiled
MOD_OBJS_DIR = $(OUTPUT_DIR)/corenrn/build

# Linked libraries gathered by CMake
LDFLAGS = $(LINKFLAGS) @CORENRN_COMMON_LDFLAGS@

# Includes paths gathered by CMake
# coreneuron/utils/randoms goes first because it needs to override the NEURON
# directory in INCFLAGS
INCLUDES = -I$(CORENRN_INC_DIR)/coreneuron/utils/randoms $(INCFLAGS) -I$(CORENRN_INC_DIR)
ifeq (@CORENRN_ENABLE_MPI_DYNAMIC@, OFF)
  INCLUDES += $(if @MPI_CXX_INCLUDE_PATH@, -I$(subst ;, -I,@MPI_CXX_INCLUDE_PATH@),)
endif
INCLUDES += $(if @reportinglib_INCLUDE_DIR@, -I$(subst ;, -I,@reportinglib_INCLUDE_DIR@),)

# CXX is always defined. If the definition comes from default change it
ifeq ($(origin CXX), default)
    CXX = @CMAKE_CXX_COMPILER@
endif

ifeq (@CORENRN_ENABLE_GPU@, ON)
  ifneq ($(shell $(CXX) --version | grep -o nvc++), nvc++)
    $(error GPU wheels are only compatible with the NVIDIA C++ compiler nvc++, but CXX=$(CXX) and --version gives $(shell $(CXX) --version))
  endif
  # nvc++ -dumpversion is simpler, but only available from 22.2
  ifeq ($(findstring nvc++ @CORENRN_NVHPC_MAJOR_MINOR_VERSION@, $(shell $(CXX) --version)),)
    $(error GPU wheels are currently not compatible across NVIDIA HPC SDK versions. You have $(shell $(CXX) -V | grep nvc++) but this wheel was built with @CORENRN_NVHPC_MAJOR_MINOR_VERSION@.)
  endif
endif

# In case of wheel, python and perl exe paths are from the build machine.
# First prefer env variables set by neuron's nrnivmodl wrapper then check
# binary used during build. If they don't exist then simply use python and
# perl as the name of binaries.
CORENRN_PYTHONEXE ?= @PYTHON_EXECUTABLE@
CORENRN_PERLEXE ?= @PERL_EXECUTABLE@
ifeq ($(wildcard $(CORENRN_PYTHONEXE)),)
  CORENRN_PYTHONEXE=python
endif
ifeq ($(wildcard $(CORENRN_PERLEXE)),)
  CORENRN_PERLEXE=perl
endif

CXXFLAGS = @CORENRN_CXX_FLAGS@
CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ $(INCLUDES)
CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@
CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@

# env variables required for mod2c or nmodl
MOD2CPP_ENV_VAR = @CORENRN_SANITIZER_ENABLE_ENVIRONMENT_STRING@ PYTHONPATH=@CORENRN_NMODL_PYTHONPATH@:${CORENRN_LIB_DIR}/python MODLUNIT=$(CORENRN_SHARE_MOD2CPP_DIR)/nrnunits.lib

# nmodl options
ifeq (@CORENRN_ENABLE_NMODL@, ON)
    ifeq (@CORENRN_ENABLE_GPU@, ON)
        nmodl_arguments_c=@NMODL_ACC_BACKEND_ARGS@ @NMODL_COMMON_ARGS@
    else
        nmodl_arguments_c=@NMODL_CPU_BACKEND_ARGS@ @NMODL_COMMON_ARGS@
    endif
endif

# name of the mechanism library with suffix if provided
COREMECH_LIB_NAME = corenrnmech$(if $(MECHLIB_SUFFIX),_$(MECHLIB_SUFFIX),)
COREMECH_LIB_PATH = $(OUTPUT_DIR)/lib$(COREMECH_LIB_NAME)$(LIB_SUFFIX)

# Various header and C++/Object file
MOD_FUNC_CPP = $(MOD_TO_CPP_DIR)/_mod_func.cpp
MOD_FUNC_OBJ = $(MOD_OBJS_DIR)/_mod_func.o
ENGINEMECH_OBJ = $(MOD_OBJS_DIR)/enginemech.o

# Depending on static/shared build, determine library name and it's suffix
ifeq ($(TARGET_LIB_TYPE), STATIC)
    LIB_SUFFIX = @CMAKE_STATIC_LIBRARY_SUFFIX@
    corenrnmech_lib_target = coremech_lib_static
else
    LIB_SUFFIX = @CMAKE_SHARED_LIBRARY_SUFFIX@
    corenrnmech_lib_target = coremech_lib_shared
endif

# Binary of MOD2C/NMODL depending on CMake option activated
ifeq (@nmodl_FOUND@, TRUE)
    MOD2CPP_BINARY_PATH = $(if $(MOD2CPP_BINARY),$(MOD2CPP_BINARY), @CORENRN_MOD2CPP_BINARY@)
    INCLUDES += -I@CORENRN_MOD2CPP_INCLUDE@
else
    MOD2CPP_BINARY_PATH = $(if $(MOD2CPP_BINARY),$(MOD2CPP_BINARY), $(CORENRN_BIN_DIR)/@nmodl_binary_name@)
endif

# MOD files with full path, without path and names without .mod extension
mod_files_paths = $(sort $(wildcard $(MODS_PATH)/*.mod))
mod_files_names = $(sort $(notdir $(wildcard $(MODS_PATH)/*.mod)))
mod_files_no_ext = $(mod_files_names:.mod=)
mod_files_for_cpp_backend = $(foreach mod_file, $(mod_files_paths), $(addprefix $(MOD_TO_CPP_DIR)/, $(notdir $(mod_file))))

# CPP files generated from the mod files, and the object files compiled from them
mod_cpp_files = $(patsubst %.mod,%.cpp,$(mod_files_for_cpp_backend))
mod_cpp_objs = $(addprefix $(MOD_OBJS_DIR)/,$(addsuffix .o,$(basename $(mod_files_no_ext))))

# We use $ORIGIN (@loader_path in OSX)
ORIGIN_RPATH := $(if $(filter Darwin,$(OS_NAME)),@loader_path,$$ORIGIN)
SONAME_OPTION := -Wl,$(if $(filter Darwin,$(OS_NAME)),-install_name${COMMA_OP}@rpath/,-soname${COMMA_OP})$(notdir ${COREMECH_LIB_PATH})
LIB_RPATH = $(if $(DESTDIR),$(DESTDIR)/lib,$(ORIGIN_RPATH))

# When special-core is installed, it needs to find library in the
# lib folder of install prefix. We use relative path in order it
# to be portable when files are moved (e.g. python wheel)
INSTALL_LIB_RPATH = $(ORIGIN_RPATH)/../lib

# All objects used during build
ALL_OBJS = $(MOD_FUNC_OBJ) $(mod_cpp_objs)

# Colors for pretty printing
C_RESET := \033[0m
C_GREEN := \033[32m

# Default nmodl flags. Override if MOD2CPP_RUNTIME_FLAGS is not empty
ifeq (@CORENRN_ENABLE_NMODL@, ON)
    MOD2CPP_FLAGS_C = $(if $(MOD2CPP_RUNTIME_FLAGS),$(MOD2CPP_RUNTIME_FLAGS),$(nmodl_arguments_c))
endif

# Print the make variable that MOD2CPP_FLAGS_C falls back to, so the message
# always matches the defaults selected in this makefile
$(info Default NMODL flags: $(nmodl_arguments_c))

ifneq ($(MOD2CPP_RUNTIME_FLAGS),)
    $(warning Runtime nmodl flags (they replace the default ones): $(MOD2CPP_RUNTIME_FLAGS))
endif

# ======== MAIN BUILD RULES ============


# main target to build binary
$(SPECIAL_EXE): $(corenrnmech_lib_target)
	@printf " => $(C_GREEN)Binary$(C_RESET) creating $(SPECIAL_EXE)\n"
	$(CXX_LINK_EXE_CMD) -o $(SPECIAL_EXE) $(CORENRN_SHARE_CORENRN_DIR)/coreneuron.cpp \
	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
	  -L$(OUTPUT_DIR) -l$(COREMECH_LIB_NAME) $(LDFLAGS) \
	  -L$(CORENRN_LIB_DIR) \
	  -Wl,-rpath,'$(LIB_RPATH)' -Wl,-rpath,$(CORENRN_LIB_DIR) -Wl,-rpath,'$(INSTALL_LIB_RPATH)'

$(ENGINEMECH_OBJ): $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp | $(MOD_OBJS_DIR)
	$(CXX_COMPILE_CMD) -c -DADDITIONAL_MECHS $(CORENRN_SHARE_CORENRN_DIR)/enginemech.cpp -o $(ENGINEMECH_OBJ)

# build shared library of mechanisms
coremech_lib_shared: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
	# extract the object files from libcoreneuron-core.a
	mkdir -p $(MOD_OBJS_DIR)/libcoreneuron-core
	rm -f $(MOD_OBJS_DIR)/libcoreneuron-core/*.o
	# --output is only supported by modern versions of ar
	(cd $(MOD_OBJS_DIR)/libcoreneuron-core && ar x $(CORENRN_LIB_DIR)/libcoreneuron-core.a)
	$(CXX_SHARED_LIB_CMD) $(ENGINEMECH_OBJ) -o ${COREMECH_LIB_PATH} $(ALL_OBJS) \
	  -I$(CORENRN_INC_DIR) $(INCFLAGS) \
	  @CORENEURON_LINKER_START_GROUP@ \
	  $(MOD_OBJS_DIR)/libcoreneuron-core/*.o @CORENEURON_LINKER_END_GROUP@ \
		$(LDFLAGS) ${SONAME_OPTION} \
		-Wl,-rpath,$(CORENRN_LIB_DIR) -L$(CORENRN_LIB_DIR)
	# cleanup
	rm $(MOD_OBJS_DIR)/libcoreneuron-core/*.o

# build static library of mechanisms
coremech_lib_static: $(ALL_OBJS) $(ENGINEMECH_OBJ) build_always
	# make a libcorenrnmech.a by copying libcoreneuron-core.a and then appending
	# the newly compiled objects
	cp $(CORENRN_LIB_DIR)/libcoreneuron-core.a ${COREMECH_LIB_PATH}
	ar r ${COREMECH_LIB_PATH} $(ENGINEMECH_OBJ) $(ALL_OBJS)

# compile cpp files to .o
$(MOD_OBJS_DIR)/%.o: $(MOD_TO_CPP_DIR)/%.cpp | $(MOD_OBJS_DIR)
	$(CXX_COMPILE_CMD) -c $< -o $@ -DNRN_PRCELLSTATE=$(NRN_PRCELLSTATE) @CORENEURON_TRANSLATED_CODE_COMPILE_FLAGS@

# translate MOD files to CPP using mod2c/NMODL
$(mod_cpp_files): $(MOD_TO_CPP_DIR)/%.cpp: $(MODS_PATH)/%.mod | $(MOD_TO_CPP_DIR)
	$(MOD2CPP_ENV_VAR) $(MOD2CPP_BINARY_PATH) $< -o $(MOD_TO_CPP_DIR)/ $(MOD2CPP_FLAGS_C)

# generate mod registration function. Dont overwrite if it's not changed
$(MOD_FUNC_CPP): build_always | $(MOD_TO_CPP_DIR)
	$(CORENRN_PERLEXE) $(CORENRN_SHARE_CORENRN_DIR)/mod_func.c.pl $(mod_files_names) > $(MOD_FUNC_CPP).tmp
	diff -q $(MOD_FUNC_CPP).tmp $(MOD_FUNC_CPP) || \
	mv $(MOD_FUNC_CPP).tmp $(MOD_FUNC_CPP)

# symlink to cpp files provided by coreneuron
$(MOD_TO_CPP_DIR)/%.cpp: $(CORENRN_SHARE_MOD2CPP_DIR)/%.cpp | $(MOD_TO_CPP_DIR)
	ln -s $< $@

# create directories needed
$(MOD_TO_CPP_DIR):
	mkdir -p $(MOD_TO_CPP_DIR)

$(MOD_OBJS_DIR):
	mkdir -p $(MOD_OBJS_DIR)

# install binary and libraries
install: $(SPECIAL_EXE)
	install -d $(DESTDIR)/bin $(DESTDIR)/lib
	install ${COREMECH_LIB_PATH} $(DESTDIR)/lib
	install $(SPECIAL_EXE) $(DESTDIR)/bin

# Targets that do not name files: build_always forces its dependents to be
# remade on every run, install must run even if a file called "install" exists
.PHONY: build_always install

$(VERBOSE).SILENT:

# delete cpp files if mod2c error, otherwise they are not generated again
.DELETE_ON_ERROR:


================================================
FILE: tests/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================

include(TestHelpers)

include_directories(${CORENEURON_PROJECT_SOURCE_DIR} ${CORENEURON_PROJECT_BINARY_DIR}/generated
                    ${Boost_INCLUDE_DIRS})

# Add compiler flags that should apply to all CoreNEURON targets, but which should not leak into
# other included projects.
add_compile_definitions(${CORENRN_COMPILE_DEFS})
add_compile_options(${CORENRN_EXTRA_CXX_FLAGS})
add_link_options(${CORENRN_EXTRA_LINK_FLAGS})

if(NOT Boost_USE_STATIC_LIBS)
  add_definitions(-DBOOST_TEST_DYN_LINK=TRUE)
endif()

set(CMAKE_BUILD_RPATH ${CMAKE_BINARY_DIR}/bin/${CMAKE_HOST_SYSTEM_PROCESSOR})

set(Boost_NO_BOOST_CMAKE TRUE)
# Minimum set by needing the multi-argument version of BOOST_AUTO_TEST_CASE.
find_package(Boost 1.59 QUIET COMPONENTS filesystem system atomic unit_test_framework)

# Unit tests are only added when Boost was found AND they were requested; the
# status message reports which of the three situations applies.
if(Boost_FOUND AND CORENRN_ENABLE_UNIT_TESTS)
  # Interface target carrying the flags, include directories and libraries
  # shared by all unit tests.
  add_library(coreneuron-unit-test INTERFACE)
  target_compile_options(coreneuron-unit-test
                         INTERFACE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS})
  target_include_directories(coreneuron-unit-test SYSTEM INTERFACE ${Boost_INCLUDE_DIRS})
  target_link_libraries(coreneuron-unit-test INTERFACE coreneuron-all)
  add_subdirectory(unit/cmdline_interface)
  add_subdirectory(unit/interleave_info)
  add_subdirectory(unit/alignment)
  add_subdirectory(unit/queueing)
  add_subdirectory(unit/solver)
  # lfp test uses nrnmpi_* wrappers but does not load the dynamic MPI library TODO: re-enable
  # after NEURON and CoreNEURON dynamic MPI are merged
  if(NOT CORENRN_ENABLE_MPI_DYNAMIC)
    add_subdirectory(unit/lfp)
  endif()
  message(STATUS "Boost found, unit tests enabled")
elseif(Boost_FOUND)
  message(STATUS "Boost found, unit tests disabled (CORENRN_ENABLE_UNIT_TESTS is OFF)")
else()
  message(STATUS "Boost not found, unit tests disabled")
endif()

add_subdirectory(integration)


================================================
FILE: tests/integration/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================

if(CORENRN_ENABLE_MPI_DYNAMIC)
  # ~~~
  # In case of submodule building we don't know the MPI launcher and mpi
  # distribution being used. So for now just skip these tests and rely on
  # neuron to test dynamic mpi mode. For coreneuron build assume are just
  # building single generic mpi library libcorenrn_mpi.<suffix>
  # ~~~
  if(CORENEURON_AS_SUBPROJECT)
    message(STATUS "CoreNEURON integration tests are disabled with dynamic MPI")
    return()
  else()
    set(CORENRN_MPI_LIB_ARG
        "--mpi-lib ${PROJECT_BINARY_DIR}/lib/lib${CORENRN_MPI_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}"
    )
  endif()
endif()

set(COMMON_ARGS "--tstop 100. --celsius 6.3 --mpi ${CORENRN_MPI_LIB_ARG}")
set(MODEL_STATS_ARG "--model-stats")
set(RING_DATASET_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ring")
set(RING_COMMON_ARGS "--datpath ${RING_DATASET_DIR} ${COMMON_ARGS}")
set(RING_GAP_COMMON_ARGS "--datpath ${CMAKE_CURRENT_SOURCE_DIR}/ring_gap ${COMMON_ARGS}")
set(PERMUTE1_ARGS "--cell-permute 1")
set(PERMUTE2_ARGS "--cell-permute 2")
set(CUDA_INTERFACE "--cuda-interface")
if(CORENRN_ENABLE_GPU)
  set(GPU_ARGS "--gpu")
  set(permutation_modes 1 2)
else()
  set(permutation_modes 0 1)
endif()

# List of tests with arguments
set(TEST_CASES_WITH_ARGS
    "ring!${RING_COMMON_ARGS} ${MODEL_STATS_ARG} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring"
    "ring_binqueue!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_binqueue --binqueue"
    "ring_multisend!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_multisend --multisend"
    "ring_spike_buffer!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_spike_buffer --spikebuf 1"
    "ring_gap!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap"
    "ring_gap_binqueue!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_binqueue --binqueue"
    "ring_gap_multisend!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_multisend --multisend"
)
set(test_suffixes "" "_binqueue" "_multisend")
foreach(cell_permute ${permutation_modes})
  list(APPEND test_suffixes "_permute${cell_permute}")
  list(
    APPEND
    TEST_CASES_WITH_ARGS
    "ring_permute${cell_permute}!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute${cell_permute} --cell-permute=${cell_permute}"
    "ring_gap_permute${cell_permute}!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute${cell_permute} --cell-permute=${cell_permute}"
  )
  # As reports require MPI, do not add test if report is enabled.
  if(NOT CORENRN_ENABLE_REPORTING)
    list(APPEND test_suffixes "_serial_permute${cell_permute}")
    list(
      APPEND
      TEST_CASES_WITH_ARGS
      "ring_serial_permute${cell_permute}!${GPU_ARGS} --cell-permute=${cell_permute} --tstop 100. --celsius 6.3 --datpath ${RING_DATASET_DIR} ${MODEL_STATS_ARG} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_serial_permute${cell_permute}"
    )
  endif()
endforeach()

if(CORENRN_ENABLE_GPU)
  list(APPEND test_suffixes "_permute2_cudaInterface")
  list(
    APPEND
    TEST_CASES_WITH_ARGS
    "ring_permute2_cudaInterface!${RING_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}"
    "ring_gap_permute2_cudaInterface!${RING_GAP_COMMON_ARGS} ${GPU_ARGS} --outpath ${CMAKE_CURRENT_BINARY_DIR}/ring_gap_permute2_cudaInterface ${PERMUTE2_ARGS} ${CUDA_INTERFACE}"
  )
endif()

# ~~~
# There are no directories for permute and multisend related tests,
# create them and copy reference spikes
# ~~~
foreach(data_dir "ring" "ring_gap")
  # Naïve foreach(test_suffix ${test_suffixes}) does not seem to handle empty suffixes correctly.
  list(LENGTH test_suffixes num_suffixes)
  math(EXPR num_suffixes_m1 "${num_suffixes} - 1")
  foreach(suffix_index RANGE 0 ${num_suffixes_m1})
    list(GET test_suffixes ${suffix_index} test_suffix)
    file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/${data_dir}/out.dat.ref"
         DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/${data_dir}${test_suffix}/")
  endforeach()
endforeach()
# test without ring_gap version
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/ring/out.dat.ref"
     DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/ring_spike_buffer/")

# names of all tests added
set(CORENRN_TEST_NAMES "")

# Configure test scripts. Every entry of TEST_CASES_WITH_ARGS has the form
# "<test name>!<command line arguments>".
foreach(args_line ${TEST_CASES_WITH_ARGS})
  string(REPLACE "!" ";" string_line ${args_line})
  # Defaults: a single process started directly, without srun or mpiexec. Resetting
  # SRUN_PREFIX here keeps a launcher prefix from leaking from a previous iteration
  # or an outer scope.
  set(test_num_processors 1)
  set(SRUN_PREFIX "")
  # With MPI available every test except the serial ones runs on two ranks
  # through the MPI launcher.
  if(MPI_FOUND AND NOT args_line MATCHES "ring_serial.*")
    set(test_num_processors 2)
    string(REPLACE ";" " " SRUN_PREFIX "${TEST_MPI_EXEC_BIN};-n;${test_num_processors}")
  endif()
  list(GET string_line 0 TEST_NAME)
  list(GET string_line 1 TEST_ARGS)
  set(SIM_NAME ${TEST_NAME})
  # SRUN_PREFIX, TEST_ARGS and SIM_NAME are substituted into the generated script
  configure_file(integration_test.sh.in ${TEST_NAME}/integration_test.sh @ONLY)
  add_test(
    NAME ${TEST_NAME}_TEST
    COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}/integration_test.sh
    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}")
  set_tests_properties(${TEST_NAME}_TEST PROPERTIES PROCESSORS ${test_num_processors})
  cpp_cc_configure_sanitizers(TEST ${TEST_NAME}_TEST)
  list(APPEND CORENRN_TEST_NAMES ${TEST_NAME}_TEST)
endforeach()

if(CORENRN_ENABLE_REPORTING)
  foreach(TEST_NAME "1")
    set(SIM_NAME "reporting_${TEST_NAME}")
    set(CONFIG_ARG "${TEST_NAME}")
    configure_file(reportinglib/${TEST_NAME}.conf.in ${SIM_NAME}/${TEST_NAME}.conf @ONLY)
    configure_file(reportinglib/reporting_test.sh.in ${SIM_NAME}/reporting_test.sh @ONLY)
    configure_file(reportinglib/${TEST_NAME}.check.in ${SIM_NAME}/${TEST_NAME}.check @ONLY)
    file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/reportinglib/test_ref.out" DESTINATION "${SIM_NAME}/")
    add_test(
      NAME ${SIM_NAME}
      COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME}/reporting_test.sh
      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME}")
    cpp_cc_configure_sanitizers(TEST ${SIM_NAME})
    list(APPEND CORENRN_TEST_NAMES ${SIM_NAME})
  endforeach()
endif()


================================================
FILE: tests/integration/README.md
================================================
# Generating Tests Input Dataset

There are two integration tests under the `tests/integration/` directory. The input dataset is generated using NEURON. You can follow the steps below to generate the test data.

Once you have latest NEURON installed, you have to clone [ringtest](https://github.com/nrnhines/ringtest) model from github:

```bash
git clone https://github.com/nrnhines/ringtest.git
```

You have to create `special` as usual with NEURON:

```bash
nrnivmodl mod
```

Now we can generate data for `ring` test as:


```bash
mpirun -n 2 ./x86_64/special ringtest.py -nring 1 -ncell 20 -tstop 100 -mpi -dumpmodel

# sort spikes and remove old spike output
sortspike spk2.std coredat/out.dat.ref
rm spk2.std
```

The generated dataset can be copied to `tests/integration/ring/`:

```bash
mv coredat/* <external>/coreneuron/tests/integration/ring/
```


Similarly, dataset for `ring_gap` test can be generated as:

```bash
mpirun -n 2 ./x86_64/special ringtest.py -nring 1 -ncell 20 -tstop 100 -gap -mpi -dumpmodel

# sort spikes and remove old spike output
sortspike spk2.std coredat/out.dat.ref
rm spk2.std
mv coredat/* <external>/coreneuron/tests/integration/ring_gap/
```


================================================
FILE: tests/integration/integration_test.sh.in
================================================
#!/usr/bin/env bash
# Integration test driver: runs special-core with the configured arguments and
# compares the produced spike output against the reference file out.dat.ref.
set -e

export OMP_NUM_THREADS=1
export LIBSONATA_ZERO_BASED_GIDS=true

# Run the executable. Because `set -e` is active, the exit status has to be
# captured with `|| exitvalue=$?`; a plain failing command would terminate the
# script before the status could be reported below.
SRUN_EXTRA=
exitvalue=0
if [ -n "$VALGRIND" ] && [ -n "$VALGRIND_PRELOAD" ]; then
    echo "Running with valgrind"
    LD_PRELOAD=$VALGRIND_PRELOAD \
    @SRUN_PREFIX@ $SRUN_EXTRA $VALGRIND @CMAKE_BINARY_DIR@/bin/@CMAKE_SYSTEM_PROCESSOR@/special-core @TEST_ARGS@ || exitvalue=$?
else
    @SRUN_PREFIX@ $SRUN_EXTRA @CMAKE_BINARY_DIR@/bin/@CMAKE_SYSTEM_PROCESSOR@/special-core @TEST_ARGS@ || exitvalue=$?
fi

# Check for error result
if [ $exitvalue -ne 0 ]; then
  echo "Error status value: $exitvalue"
  exit $exitvalue
fi

# Diff the output files against the reference
cd @CMAKE_CURRENT_BINARY_DIR@/@SIM_NAME@

# We convert spikes to out.dat format
reports=@ENABLE_SONATA_REPORTS_TESTS@
if [ "$reports" = "ON" ]
then
  data=$(@H5DUMP_EXECUTABLE@ -d /spikes/All/timestamps -d /spikes/All/node_ids -y -O out.h5 | sed 's/"ms"//g;s/,/\n/g')
  # $data is deliberately left unquoted: word splitting puts every value on a
  # single line, so awk sees them as one record and can pair field i with
  # field n+i.
  echo $data | awk '{n=NF/2; for (i=1;i<=n;i++) print $i "\t" $(n+i) }' > out_SONATA.dat

  if [ ! -f out_SONATA.dat ]
  then
    echo "[ERROR] No SONATA output files. Test failed!" >&2
    exit 1
  fi
  # `|| true` keeps `set -e` from aborting on a non-zero diff status, so that
  # the explicit error message below is actually printed.
  diff -w out_SONATA.dat out.dat.ref > diff_SONATA.dat 2>&1 || true
  if [ -s diff_SONATA.dat ]
  then
    echo "[ERROR] SONATA Results are different, check the file diff_SONATA.dat. Test failed!" >&2
    exit 1
  fi
fi

if [ ! -f out.dat ]
then
  echo "[ERROR] No output files. Test failed!" >&2
  exit 1
fi

diff -w out.dat out.dat.ref > diff.dat 2>&1 || true

if [ -s diff.dat ]
then
  echo "[ERROR] Results are different, check the file diff.dat. Test failed!" >&2
  exit 1
else
  echo "Results are the same, test passed"
  rm -f *.dat
  exit 0
fi


================================================
FILE: tests/integration/reportinglib/1.check.in
================================================
#!/bin/sh
# Validates the reports written by a reporting integration run, using the
# files found in the current working directory:
#   - test_1.bbp  (checked when bin_reports is ON) is dumped with somaDump
#   - test_2.h5   (checked when sonata_reports is ON) is dumped with h5dump
# Each dump is compared with test_ref.out. Exits with $OK when every enabled
# check passes and with $FAILED otherwise; diagnostics go to stderr.

OK=0
FAILED=1
sonata_reports=@ENABLE_SONATA_REPORTS_TESTS@
bin_reports=@ENABLE_BIN_REPORTS_TESTS@
test_ref=@CMAKE_CURRENT_BINARY_DIR@/@SIM_NAME@/test_ref.out

if [ "$bin_reports" = "ON" ]
then
  if [ -f test_1.bbp ]
  then
    # The status of the assignment is the status of the pipeline, i.e. of diff.
    if ! somaDump_diff=$(@reportinglib_somaDump@ test_1.bbp 1 | sed 's/ //g' | diff "$test_ref" -)
    then
      # printf instead of `echo -e`: the latter is not portable under /bin/sh.
      printf '%s\n%s\n' "[ERROR] The report output generated by Reportinglib differs!" "$somaDump_diff" >&2
      exit $FAILED
    fi
  else
     echo "[ERROR] Expected ReportingLib soma file 'test_1.bbp' is missing. Test failed!" >&2
     exit $FAILED
  fi
fi

if [ "$sonata_reports" = "ON" ]
then
  if [ -f test_2.h5 ]
  then
    # sed drops the first and last line of the h5dump output and strips commas
    # and blanks before the comparison.
    if ! h5dump_diff=$(@H5DUMP_EXECUTABLE@ -d /report/PopA/data -y -O test_2.h5 | sed '1d;$d;s/,//g;s/ //g' | diff "$test_ref" -)
    then
      printf '%s\n%s\n' "[ERROR] The report output generated by Libsonata differs!" "$h5dump_diff" >&2
      exit $FAILED
    fi
  else
     echo "[ERROR] Expected SONATA soma file 'test_2.h5' doesn't exist. Test failed!" >&2
     exit $FAILED
  fi
  if [ ! -f spikes.h5 ]
  then
     echo "[ERROR] Expected SONATA spike file 'spikes.h5' doesn't exist. Test failed!" >&2
     exit $FAILED
  fi
fi

# If we reach this point, all tests were successful
exit $OK


================================================
FILE: tests/integration/reportinglib/1.conf.in
================================================
outpath = ./
datpath = @CMAKE_CURRENT_SOURCE_DIR@/ring/
tstop = 10.000000
dt = 0.025000
forwardskip = 0.000000
prcellgid = -1
report-conf = @CMAKE_CURRENT_SOURCE_DIR@/reportinglib/1.report
cell-permute = 0


================================================
FILE: tests/integration/reportinglib/reporting_test.sh.in
================================================
#! /bin/sh
# Runs special-core with the generated configuration file, then validates the
# produced reports with the generated .check script.
# NOTE(review): `pipefail` is not available in every /bin/sh implementation;
# confirm that the shells this test runs under support it.

set -e -o pipefail

export OMP_NUM_THREADS=1
export LIBSONATA_ZERO_BASED_GIDS=true

@SRUN_PREFIX@ @CMAKE_BINARY_DIR@/bin/@CMAKE_SYSTEM_PROCESSOR@/special-core --mpi --read-config @CMAKE_CURRENT_BINARY_DIR@/@SIM_NAME@/@TEST_NAME@.conf
chmod +x @CMAKE_CURRENT_BINARY_DIR@/@SIM_NAME@/@TEST_NAME@.check
# Run the checker directly: as the last command its exit status becomes the
# status of this script, instead of its standard output being handed to `exit`
# as an argument.
@CMAKE_CURRENT_BINARY_DIR@/@SIM_NAME@/@TEST_NAME@.check


================================================
FILE: tests/integration/reportinglib/test_ref.out
================================================
-65
-64.9973
-64.9951
-64.9932
-64.9916
-64.9902
-64.9889
-64.9877
-64.9867
-64.9858
-64.985
-64.9842
-64.9836
-64.9829
-64.9824
-64.9819
-64.9815
-64.9811
-64.9807
-64.9804
-64.9802
-64.9799
-64.9797
-64.9796
-64.9794
-64.9793
-64.9792
-64.9791
-64.979
-64.979
-64.979
-64.979
-64.979
-64.979
-64.979
-64.9791
-64.9791
-64.7371
-63.6264
-62.1068
-60.4682
-58.847
-57.2905
-55.7913
-54.3056
-52.7594
-51.044
-48.9961
-46.3491
-42.6233
-36.8741
-27.1665
-10.1852
13.977
31.4561
36.143
35.2487
32.4239
28.6338
24.2472
19.4933
14.5405
9.51339
4.50006
-0.440951
-5.27461
-9.98373
-14.5648
-19.0258
-23.3868
-27.6838
-31.9759
-36.353
-40.9401
-45.8855
-51.303
-57.1176
-62.8313
-67.5469
-70.6416
-72.2969
-73.0829
-73.4434
-73.6102
-73.6866
-73.7171
-73.7212
-73.7082
-73.6828
-73.6479
-73.6053
-73.5561
-73.5012
-73.4414
-73.3771
-73.3089
-73.237
-73.1618
-73.0836
-73.0025


================================================
FILE: tests/integration/ring/out.dat.ref
================================================
2.65 0
5.3 1
7.95 2
10.6 3
13.25 4
15.9 5
18.55 6
21.2 7
23.85 8
26.5 9
29.15 10
31.8 11
34.45 12
37.1 13
39.75 14
42.4 15
45.05 16
47.7 17
50.35 18
53 19
55.65 0
58.3 1
60.95 2
63.6 3
66.25 4
68.9 5
71.55 6
74.2 7
76.85 8
79.5 9
82.15 10
84.8 11
87.45 12
90.1 13
92.75 14
95.4 15
98.05 16


================================================
FILE: tests/integration/ring_gap/mod files/halfgap.mod
================================================
: halfgap.mod (this header previously read "ggap.mod")
: This is a conductance based gap junction to allow setting g = 0
: Declares the point process HalfGap, which contributes the electrode
: current i = (vgap - v)*g.
NEURON {
	POINT_PROCESS HalfGap
	RANGE g, i, vgap
	ELECTRODE_CURRENT i
}
: Gap conductance; the default of 0 makes the computed current zero.
PARAMETER { g = 0 (1/megohm) }
: v, vgap and i are not given values in a PARAMETER block.
: NOTE(review): vgap is never written in this file; presumably it is set from
: outside (e.g. with the voltage on the other side of the gap) - confirm.
ASSIGNED {
	v (millivolt)
	vgap (millivolt)
	i (nanoamp)
}
: millivolt * (1/megohm) gives nanoamp, matching the declared unit of i.
BREAKPOINT { i = (vgap - v)*g }


================================================
FILE: tests/integration/ring_gap/out.dat.ref
================================================
3.275 19
4.325 0
4.425 18
5.5 1
5.575 17
6.65 2
6.75 16
7.825 3
7.9 15
8.975 4
9.05 14
10.15 5
10.225 13
11.325 6
11.4 12
12.475 7
12.55 11
13.625 8
13.7 10
14.25 9


================================================
FILE: tests/unit/alignment/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
# Build the alignment unit test executable from alignment.cpp.
add_executable(alignment_test_bin alignment.cpp)
# Link against the coreneuron-unit-test target (defined elsewhere in the build).
target_link_libraries(alignment_test_bin coreneuron-unit-test)
# Register the built binary with CTest under the name alignment_test.
add_test(NAME alignment_test COMMAND $<TARGET_FILE:alignment_test_bin>)
# Project helper defined elsewhere; presumably applies the configured sanitizer
# settings to both the target and the test.
cpp_cc_configure_sanitizers(TARGET alignment_test_bin TEST alignment_test)


================================================
FILE: tests/unit/alignment/alignment.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/utils/memory.h"

#include <boost/mpl/list.hpp>
#define BOOST_TEST_MODULE PaddingCheck
#include <boost/test/included/unit_test.hpp>

#include <cstdint>
#include <cstring>

/// Compile-time descriptor used to parametrise the alignment tests.
/// @tparam T  element type, exposed as `value_type`
/// @tparam n  chunk size, exposed as `chunk` (defaults to 1)
template <class T, int n = 1>
struct data {
    using value_type = T;
    static constexpr int chunk = n;
};

/// Element types instantiated with the default chunk size (1).
using chunk_default_data_type = boost::mpl::list<data<double>, data<long long int>>;

/// double and int elements, each combined with chunk sizes 2, 4, 8, 16 and 32.
using chunk_data_type = boost::mpl::list<data<double, 2>,
                                         data<double, 4>,
                                         data<double, 8>,
                                         data<double, 16>,
                                         data<double, 32>,
                                         data<int, 2>,
                                         data<int, 4>,
                                         data<int, 8>,
                                         data<int, 16>,
                                         data<int, 32>>;

/// Checks coreneuron::soa_padded_size for chunk sizes 1, 2, 4, 8, 16 and 32.
/// With layout argument 1 (AoS) the count is expected back unchanged; with
/// layout argument 0 (SoA) the expected values are the count rounded up to the
/// next multiple of the chunk size.
BOOST_AUTO_TEST_CASE(padding_simd) {
    using coreneuron::soa_padded_size;

    /** AOS test */
    BOOST_CHECK_EQUAL(soa_padded_size<1>(11, 1), 11);

    /** SOA tests with 11 */
    BOOST_CHECK_EQUAL(soa_padded_size<1>(11, 0), 11);
    BOOST_CHECK_EQUAL(soa_padded_size<2>(11, 0), 12);
    BOOST_CHECK_EQUAL(soa_padded_size<4>(11, 0), 12);
    BOOST_CHECK_EQUAL(soa_padded_size<8>(11, 0), 16);
    BOOST_CHECK_EQUAL(soa_padded_size<16>(11, 0), 16);
    BOOST_CHECK_EQUAL(soa_padded_size<32>(11, 0), 32);

    /** SOA tests with 32: already a multiple of every chunk size */
    BOOST_CHECK_EQUAL(soa_padded_size<1>(32, 0), 32);
    BOOST_CHECK_EQUAL(soa_padded_size<2>(32, 0), 32);
    BOOST_CHECK_EQUAL(soa_padded_size<4>(32, 0), 32);
    BOOST_CHECK_EQUAL(soa_padded_size<8>(32, 0), 32);
    BOOST_CHECK_EQUAL(soa_padded_size<16>(32, 0), 32);
    BOOST_CHECK_EQUAL(soa_padded_size<32>(32, 0), 32);

    /** SOA tests with 33 */
    BOOST_CHECK_EQUAL(soa_padded_size<1>(33, 0), 33);
    BOOST_CHECK_EQUAL(soa_padded_size<2>(33, 0), 34);
    BOOST_CHECK_EQUAL(soa_padded_size<4>(33, 0), 36);
    BOOST_CHECK_EQUAL(soa_padded_size<8>(33, 0), 40);
    BOOST_CHECK_EQUAL(soa_padded_size<16>(33, 0), 48);
    BOOST_CHECK_EQUAL(soa_padded_size<32>(33, 0), 64);
}

/// Alignment of chunk boundaries when the chunk size is 1, i.e. no padding.
/// Whether a boundary is 16-byte aligned depends on the element type and on
/// the element count: 247 elements of an 8-byte type occupy 1976 bytes, which
/// is 8 modulo 16, so odd boundaries are misaligned and even ones aligned.
/// This only holds for 64-bit element types, not for 32-bit ones.
BOOST_AUTO_TEST_CASE_TEMPLATE(memory_alignment_simd_false, T, chunk_default_data_type) {
    using value_type = typename T::value_type;
    constexpr int n_chunks = 6;
    const int total_size_chunk = coreneuron::soa_padded_size<T::chunk>(247, 0);

    auto* const buffer = static_cast<value_type*>(
        coreneuron::ecalloc_align(n_chunks * total_size_chunk, sizeof(value_type), 16));

    // Even-numbered boundaries must be aligned, odd-numbered ones must not.
    for (int i = 0; i < n_chunks; ++i) {
        const bool b = coreneuron::is_aligned(buffer + i * total_size_chunk, 16);
        BOOST_CHECK_EQUAL(b, i % 2 == 0);
    }

    free_memory(buffer);
}

/// Alignment of chunk boundaries when the chunk size is at least 2: the size
/// of 247 elements is padded by soa_padded_size, and every one of the six
/// chunk boundaries is expected to be 16-byte aligned.
BOOST_AUTO_TEST_CASE_TEMPLATE(memory_alignment_simd_true, T, chunk_data_type) {
    using value_type = typename T::value_type;
    constexpr int n_chunks = 6;
    const int total_size_chunk = coreneuron::soa_padded_size<T::chunk>(247, 0);

    auto* const buffer = static_cast<value_type*>(
        coreneuron::ecalloc_align(n_chunks * total_size_chunk, sizeof(value_type), 16));

    for (int i = 0; i != n_chunks; ++i) {
        const bool b = coreneuron::is_aligned(buffer + i * total_size_chunk, 16);
        BOOST_CHECK_EQUAL(b, 1);
    }

    free_memory(buffer);
}


================================================
FILE: tests/unit/cmdline_interface/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
# Build the command-line interface unit test from test_cmdline_interface.cpp.
add_executable(cmd_interface_test_bin test_cmdline_interface.cpp)
# Link against the coreneuron-unit-test target (defined elsewhere in the build).
target_link_libraries(cmd_interface_test_bin coreneuron-unit-test)
# Register the built binary with CTest under the name cmd_interface_test.
add_test(NAME cmd_interface_test COMMAND $<TARGET_FILE:cmd_interface_test_bin>)
# Project helper defined elsewhere; presumably applies the configured sanitizer
# settings to both the target and the test.
cpp_cc_configure_sanitizers(TARGET cmd_interface_test_bin TEST cmd_interface_test)


================================================
FILE: tests/unit/cmdline_interface/test_cmdline_interface.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/apps/corenrn_parameters.hpp"

#define BOOST_TEST_MODULE cmdline_interface
#include <boost/test/included/unit_test.hpp>

#include <cfloat>

using namespace coreneuron;

/// Parses a representative command line with corenrn_parameters::parse and
/// checks that each option ends up in the matching member, that an option
/// which was not passed keeps its default, and that reset() restores defaults.
BOOST_AUTO_TEST_CASE(cmdline_interface) {
    // Flags take no value; every other option is followed by its argument.
    const char* argv[] = {
        "nrniv-core",
        "--mpi",
        "--dt",
        "0.02",
        "--tstop",
        "0.1",
#ifdef CORENEURON_ENABLE_GPU
        "--gpu",
#endif
        "--cell-permute",
        "2",
        "--nwarp",
        "8",
        "-d",
        "./",
        "--voltage",
        "-32",
        "--threading",
        "--ms-phases",
        "1",
        "--ms-subintervals",
        "2",
        "--multisend",
        "--spkcompress",
        "32",
        "--binqueue",
        "--spikebuf",
        "100",
        "--prcellgid",
        "12",
        "--forwardskip",
        "0.02",
        "--celsius",
        "25.12",
        "--mindelay",
        "0.1",
        "--dt_io",
        "0.2"};
    constexpr int argc = sizeof argv / sizeof argv[0];

    corenrn_parameters corenrn_param_test;

    corenrn_param_test.parse(argc, const_cast<char**>(argv));  // discarding const as CLI11
                                                               // interface is not const

    BOOST_CHECK(corenrn_param_test.seed == -1);  // testing default value

    // Boolean flags
    BOOST_CHECK(corenrn_param_test.mpi_enable == true);
    BOOST_CHECK(corenrn_param_test.threading == true);
    BOOST_CHECK(corenrn_param_test.multisend == true);
    // --binqueue is passed above, so verify it as well
    BOOST_CHECK(corenrn_param_test.binqueue == true);
#ifdef CORENEURON_ENABLE_GPU
    BOOST_CHECK(corenrn_param_test.gpu == true);
#else
    BOOST_CHECK(corenrn_param_test.gpu == false);
#endif

    // Integer-valued options
    BOOST_CHECK(corenrn_param_test.spikebuf == 100);
    BOOST_CHECK(corenrn_param_test.prcellgid == 12);
    BOOST_CHECK(corenrn_param_test.cell_interleave_permute == 2);
    BOOST_CHECK(corenrn_param_test.nwarp == 8);
    BOOST_CHECK(corenrn_param_test.ms_phases == 1);
    BOOST_CHECK(corenrn_param_test.ms_subint == 2);
    BOOST_CHECK(corenrn_param_test.spkcompress == 32);

    // Floating-point options: compared exactly against the same decimal
    // literals that were passed on the command line.
    BOOST_CHECK(corenrn_param_test.dt == 0.02);
    BOOST_CHECK(corenrn_param_test.tstop == 0.1);
    BOOST_CHECK(corenrn_param_test.dt_io == 0.2);
    BOOST_CHECK(corenrn_param_test.forwardskip == 0.02);
    BOOST_CHECK(corenrn_param_test.celsius == 25.12);
    BOOST_CHECK(corenrn_param_test.voltage == -32);
    BOOST_CHECK(corenrn_param_test.mindelay == 0.1);

    // Reset all parameters to their default values.
    corenrn_param_test.reset();

    // Should match a default-constructed set of parameters.
    BOOST_CHECK_EQUAL(corenrn_param_test.voltage, corenrn_parameters{}.voltage);

    // Everything has its default value, and the first `false` says not to
    // include default values in the output, so this should be empty
    BOOST_CHECK(corenrn_param_test.config_to_str(false, false).empty());
}


================================================
FILE: tests/unit/interleave_info/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
# Build the InterleaveInfo constructor unit test from check_constructors.cpp.
add_executable(interleave_info_bin check_constructors.cpp)
# Link against the coreneuron-unit-test target (defined elsewhere in the build).
target_link_libraries(interleave_info_bin coreneuron-unit-test)
# Register the built binary with CTest as interleave_info_constructor_test.
add_test(NAME interleave_info_constructor_test COMMAND $<TARGET_FILE:interleave_info_bin>)
# Project helper defined elsewhere; presumably applies the configured sanitizer
# settings to both the target and the test.
cpp_cc_configure_sanitizers(TARGET interleave_info_bin TEST interleave_info_constructor_test)


================================================
FILE: tests/unit/interleave_info/check_constructors.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/permute/cellorder.hpp"

// Boost.Test module name for this executable; it must be defined before the
// Boost.Test header is included.
#define BOOST_TEST_MODULE interleave_info
#include <boost/test/included/unit_test.hpp>

using namespace coreneuron;

/// Fills an InterleaveInfo by hand, then checks that both the copy constructor
/// and the assignment operator reproduce a selection of its members.
BOOST_AUTO_TEST_CASE(interleave_info_test) {
    const size_t nwarp = 4;
    const size_t nstride = 6;

    int int_source[] = {11, 37, 45, 2, 18, 37, 7, 39, 66, 33};
    size_t size_source[] = {111, 137, 245, 12, 118, 237, 199, 278, 458};

    InterleaveInfo original;
    original.nwarp = nwarp;
    original.nstride = nstride;

    // to avoid same values, different sub-array is used to initialize different members
    copy_align_array(original.stridedispl, int_source, nwarp + 1);
    copy_align_array(original.stride, int_source + 1, nstride);
    copy_align_array(original.firstnode, int_source + 1, nwarp + 1);
    copy_align_array(original.lastnode, int_source + 1, nwarp + 1);

    // firstnode and lastnode were filled from the same source range: the
    // pointers must differ while the contents must match.
    BOOST_CHECK_NE(original.firstnode, original.lastnode);
    BOOST_CHECK_EQUAL_COLLECTIONS(original.firstnode,
                                  original.firstnode + nwarp + 1,
                                  original.lastnode,
                                  original.lastnode + nwarp + 1);

    copy_align_array(original.cellsize, int_source + 4, nwarp);
    copy_array(original.nnode, size_source, nwarp);
    copy_array(original.ncycle, size_source + 1, nwarp);
    copy_array(original.idle, size_source + 2, nwarp);
    copy_array(original.cache_access, size_source + 3, nwarp);
    copy_array(original.child_race, size_source + 4, nwarp);

    // copy constructor
    InterleaveInfo copy_constructed(original);

    // assignment operator
    InterleaveInfo copy_assigned;
    copy_assigned = original;

    const std::vector<InterleaveInfo*> copies = {&copy_constructed, &copy_assigned};

    // test few members
    for (InterleaveInfo* copy: copies) {
        BOOST_CHECK_EQUAL(original.nwarp, copy->nwarp);
        BOOST_CHECK_EQUAL(original.nstride, copy->nstride);

        BOOST_CHECK_EQUAL_COLLECTIONS(original.stridedispl,
                                      original.stridedispl + nwarp + 1,
                                      copy->stridedispl,
                                      copy->stridedispl + nwarp + 1);

        BOOST_CHECK_EQUAL_COLLECTIONS(original.stride,
                                      original.stride + nstride,
                                      copy->stride,
                                      copy->stride + nstride);

        BOOST_CHECK_EQUAL_COLLECTIONS(original.cellsize,
                                      original.cellsize + nwarp,
                                      copy->cellsize,
                                      copy->cellsize + nwarp);

        BOOST_CHECK_EQUAL_COLLECTIONS(original.child_race,
                                      original.child_race + nwarp,
                                      copy->child_race,
                                      copy->child_race + nwarp);
    }
}


================================================
FILE: tests/unit/lfp/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
# Build the LFP unit test executable from lfp.cpp.
add_executable(lfp_test_bin lfp.cpp)
# Link against the coreneuron-unit-test target (defined elsewhere in the build).
target_link_libraries(lfp_test_bin coreneuron-unit-test)
# Register the built binary with CTest under the name lfp_test.
add_test(NAME lfp_test COMMAND $<TARGET_FILE:lfp_test_bin>)
# Project helper defined elsewhere; presumably applies the configured sanitizer
# settings to both the target and the test.
cpp_cc_configure_sanitizers(TARGET lfp_test_bin TEST lfp_test)
# Append OMP_NUM_THREADS=1 to the environment the test runs with.
set_property(
  TEST lfp_test
  APPEND
  PROPERTY ENVIRONMENT OMP_NUM_THREADS=1)


================================================
FILE: tests/unit/lfp/lfp.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/io/lfp.hpp"
#include "coreneuron/mpi/nrnmpi.h"

#define BOOST_TEST_MODULE LFPTest
#include <boost/test/included/unit_test.hpp>

#include <iostream>

using namespace coreneuron;
using namespace coreneuron::lfputils;

/// Approximates the integral of `f` over [a, b] with the midpoint rule.
/// @param f  callable taking a double and returning a value convertible to double
/// @param a  lower integration bound
/// @param b  upper integration bound
/// @param n  number of equally wide sub-intervals
/// @return   sum over all sub-intervals of f(midpoint) * width
template <typename F>
double integral(F f, double a, double b, int n) {
    const double width = (b - a) / n;
    double total = 0.0;
    int slice = 0;
    while (slice < n) {
        // evaluate f at the centre of the current sub-interval
        const double midpoint = a + (slice + 0.5) * width;
        total += f(midpoint) * width;
        ++slice;
    }
    return total;
}


/// Tests the line-source LFP factor against a numerical integral, checks the
/// flooring and the symmetry of the formula, and finally compares the
/// LineSource and PointSource LFPCalculator results for a small set of
/// segments and electrodes.
BOOST_AUTO_TEST_CASE(LFP_PointSource_LineSource) {
#if NRNMPI
    nrnmpi_init(nullptr, nullptr, false);
#endif
    // A single segment along the z axis, starting at z = 1.0e-6.
    // NOTE(review): paxpy(p, a, x) presumably returns p + a * x - confirm in lfp.hpp.
    double segment_length{1.0e-6};
    double segment_start_val{1.0e-6};
    std::array<double, 3> segment_start = std::array<double, 3>{0.0, 0.0, segment_start_val};
    std::array<double, 3> segment_end =
        paxpy(segment_start, 1.0, std::array<double, 3>{0.0, 0.0, segment_length});
    double floor{1.0e-6};
    // `pi` is not declared in this test; presumably it is a mutable variable
    // provided by coreneuron::lfputils - verify.
    pi = 3.141592653589;

    std::array<double, 10> vals;
    double circling_radius{1.0e-6};
    std::array<double, 3> segment_middle{0.0, 0.0, 1.5e-6};
    double medium_resistivity_fac{1.0};
    for (auto k = 0; k < 10; k++) {
        // Electrode moving towards the segment middle along y, from 1.0e-5
        // in steps of 1.0e-6.
        std::array<double, 3> approaching_elec =
            paxpy(segment_middle, 1.0, std::array<double, 3>{0.0, 1.0e-5 - k * 1.0e-6, 0.0});
        // Electrode on a circle of radius circling_radius in the y-z plane
        // around the segment middle, at angle 2*pi*k/10.
        std::array<double, 3> circling_elec =
            paxpy(segment_middle,
                  1.0,
                  std::array<double, 3>{0.0,
                                        circling_radius * std::cos(2.0 * pi * k / 10),
                                        circling_radius * std::sin(2.0 * pi * k / 10)});

        double analytic_approaching_lfp = line_source_lfp_factor(
            approaching_elec, segment_start, segment_end, floor, medium_resistivity_fac);
        double analytic_circling_lfp = line_source_lfp_factor(
            circling_elec, segment_start, segment_end, floor, medium_resistivity_fac);
        // Midpoint-rule integral over x in [0, 1] of 1 / max(floor, distance),
        // where the distance is measured from the circling electrode to the
        // point selected by x on the segment.
        double numeric_circling_lfp = integral(
            [&](double x) {
                return 1.0 / std::max(floor,
                                      norm(paxpy(circling_elec,
                                                 -1.0,
                                                 paxpy(segment_end,
                                                       x,
                                                       paxpy(segment_start, -1.0, segment_end)))));
            },
            0.0,
            1.0,
            10000);
        // TEST of analytic vs numerical integration
        std::clog << "ANALYTIC line source " << analytic_circling_lfp
                  << " vs NUMERIC line source LFP " << numeric_circling_lfp << "\n";
        BOOST_REQUIRE_CLOSE(analytic_circling_lfp, numeric_circling_lfp, 1.0e-6);
        // TEST of LFP Flooring
        // NOTE(review): for k in [0, 9] the y offset is never below about
        // 1.0e-6, so the condition `< 0.866e-6` appears never to hold and this
        // requirement is then trivially true - confirm the intended range.
        BOOST_REQUIRE((approaching_elec[1] < 0.866e-6) ? analytic_approaching_lfp == 1.0e6 : true);
        vals[k] = analytic_circling_lfp;
    }
    // TEST of SYMMETRY of LFP FORMULA
    // vals[k] and vals[k + 5] belong to electrodes half a turn apart on the
    // circle; their relative difference must stay below 1.0e-12.
    for (size_t k = 0; k < 5; k++) {
        BOOST_REQUIRE(std::abs((vals[k] - vals[k + 5]) /
                               std::max(std::abs(vals[k]), std::abs(vals[k + 5]))) < 1.0e-12);
    }
    // Four segments on the z axis, two electrodes, and one value per segment
    // passed to lfp(); the same inputs are given to both calculators.
    std::vector<std::array<double, 3>> segments_starts = {{0., 0., 1.},
                                                          {0., 0., 0.5},
                                                          {0.0, 0.0, 0.0},
                                                          {0.0, 0.0, -0.5}};
    std::vector<std::array<double, 3>> segments_ends = {{0., 0., 0.},
                                                        {0., 0., 1.},
                                                        {0., 0., 0.5},
                                                        {0.0, 0.0, 0.0}};
    std::vector<double> radii{0.1, 0.1, 0.1, 0.1};
    std::vector<std::array<double, 3>> electrodes = {{0.0, 0.3, 0.0}, {0.0, 0.7, 0.8}};
    std::vector<int> indices = {0, 1, 2, 3};
    LFPCalculator<LineSource> lfp(segments_starts, segments_ends, radii, indices, electrodes, 1.0);
    lfp.template lfp<std::vector<double>>({0.0, 1.0, 2.0, 3.0});
    std::vector<double> res_line_source = lfp.lfp_values();
    LFPCalculator<PointSource> lfpp(
        segments_starts, segments_ends, radii, indices, electrodes, 1.0);
    lfpp.template lfp<std::vector<double>>({0.0, 1.0, 2.0, 3.0});
    std::vector<double> res_point_source = lfpp.lfp_values();
    // Line-source and point-source results must agree to within 1 percent
    // for both electrodes.
    BOOST_REQUIRE_CLOSE(res_line_source[0], res_point_source[0], 1.0);
    BOOST_REQUIRE_CLOSE(res_line_source[1], res_point_source[1], 1.0);
#if NRNMPI
    nrnmpi_finalize();
#endif
}


================================================
FILE: tests/unit/queueing/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
# Build the event-queue unit test executable from test_queueing.cpp.
add_executable(queuing_test_bin test_queueing.cpp)
# Link against the coreneuron-unit-test target (defined elsewhere in the build).
target_link_libraries(queuing_test_bin coreneuron-unit-test)
# Register the built binary with CTest under the name queuing_test.
add_test(NAME queuing_test COMMAND $<TARGET_FILE:queuing_test_bin>)
# Project helper defined elsewhere; presumably applies the configured sanitizer
# settings to both the target and the test.
cpp_cc_configure_sanitizers(TARGET queuing_test_bin TEST queuing_test)


================================================
FILE: tests/unit/queueing/test_queueing.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/network/netcvode.hpp"
#include "coreneuron/network/tqueue.hpp"

#define BOOST_TEST_MODULE QueueingTest
#include <boost/test/included/unit_test.hpp>

#include <cstdlib>
#include <vector>
#include <iostream>

using namespace coreneuron;
// UNIT TESTS
/// Inserts eight events with increasing times into a priority-queue based
/// TQueue and dequeues them in two batches, checking the counts on the way.
BOOST_AUTO_TEST_CASE(priority_queue_nq_dq) {
    constexpr int num = 8;
    TQueue<pq_que> tq{};

    // enqueue 8 items with increasing time
    for (int i = 0; i != num; ++i) {
        tq.insert(static_cast<double>(i), nullptr);
    }

    // One item fewer than inserted is expected inside pq_que_.
    // NOTE(review): presumably the earliest item is held outside pq_que_ - confirm.
    BOOST_CHECK(tq.pq_que_.size() == (num - 1));

    // dequeue items with time <= 5.0. Should be 6 events: from 0. to 5.
    int cnter = 0;
    for (TQItem* item = tq.atomic_dq(5.0); item != nullptr; item = tq.atomic_dq(5.0)) {
        ++cnter;
        delete item;
    }
    BOOST_CHECK(cnter == 6);
    BOOST_CHECK(tq.pq_que_.size() == (num - 6 - 1));

    // dequeue the rest
    for (TQItem* item = tq.atomic_dq(8.0); item != nullptr; item = tq.atomic_dq(8.0)) {
        ++cnter;
        delete item;
    }

    BOOST_CHECK(cnter == num);
    BOOST_CHECK(tq.pq_que_.empty());
    BOOST_CHECK(tq.least() == nullptr);
}

/// Inserts events with pseudo-random times and checks that they are dequeued
/// in non-decreasing time order.
BOOST_AUTO_TEST_CASE(tqueue_ordered_test) {
    constexpr int num = 10;
    TQueue<pq_que> tq{};

    // insert N items with time < N
    for (int i = 0; i != num; ++i) {
        tq.insert(static_cast<double>(rand() % num), nullptr);
    }

    // dequeue all items and check that previous item time <= current item time
    int cnter = 0;
    double previous = 0.0;
    for (TQItem* item = tq.atomic_dq(10.0); item != nullptr; item = tq.atomic_dq(10.0)) {
        BOOST_CHECK(previous <= item->t_);
        ++cnter;
        previous = item->t_;
        delete item;
    }
    BOOST_CHECK(cnter == num);
    BOOST_CHECK(tq.pq_que_.empty());
    BOOST_CHECK(tq.least() == nullptr);
}

// Placeholder test case: the body is empty, so nothing is checked.
BOOST_AUTO_TEST_CASE(tqueue_move_nolock) {}

// Placeholder test case: the body is empty, so nothing is checked.
BOOST_AUTO_TEST_CASE(tqueue_remove) {}

/// Sends six events through NetCvodeThreadData::interthread_send and checks
/// that all of them are stored in inter_thread_events_.
BOOST_AUTO_TEST_CASE(threaddata_interthread_send) {
    constexpr size_t num = 6;
    NetCvodeThreadData nt{};

    size_t sent = 0;
    while (sent < num) {
        nt.interthread_send(static_cast<double>(sent), nullptr, nullptr);
        ++sent;
    }

    BOOST_CHECK(nt.inter_thread_events_.size() == num);
}
/*
BOOST_AUTO_TEST_CASE(threaddata_enqueue){
    NetCvode n = NetCvode();
    const int num = 6;
    for(int i = 0; i < num; ++i)
        n.p[1].interthread_send(static_cast<double>(i), NULL, NULL);

    BOOST_CHECK(n.p[1].inter_thread_events_.size() == num);

    //enqueue the inter_thread_events_
    n.p[1].enqueue(&n, &(n.p[1]));
    BOOST_CHECK(n.p[1].inter_thread_events_.empty());
    BOOST_CHECK(n.p[1].tqe_->pq_que_.size() == num);

    //cleanup priority queue
    TQItem* item = NULL;
    while((item = n.p[1].tqe_->atomic_dq(6.0)) != NULL)
        delete item;
}*/


================================================
FILE: tests/unit/solver/CMakeLists.txt
================================================
# =============================================================================
# Copyright (c) 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
# Build the solver unit test executable from test_solver.cpp.
add_executable(test-solver test_solver.cpp)
# Link against the coreneuron-unit-test target (defined elsewhere in the build).
target_link_libraries(test-solver coreneuron-unit-test)
# Register the built binary with CTest; test and target share the name test-solver.
add_test(NAME test-solver COMMAND $<TARGET_FILE:test-solver>)
# Project helper defined elsewhere; presumably applies the configured sanitizer
# settings to both the target and the test.
cpp_cc_configure_sanitizers(TARGET test-solver TEST test-solver)


================================================
FILE: tests/unit/solver/test_solver.cpp
================================================
/*
# =============================================================================
# Copyright (c) 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/sim/multicore.hpp"

#define BOOST_TEST_MODULE CoreNEURON solver
#include <boost/test/included/unit_test.hpp>

#include <iostream>
#include <functional>
#include <map>
#include <random>
#include <utility>
#include <vector>

using namespace coreneuron;
namespace utf = boost::unit_test;


/// Plain aggregate bundling the arrays describing one solver state.
/// NOTE(review): `d` and `rhs` presumably hold the matrix diagonal and the
/// right-hand side - confirm against the code that fills them.
struct SolverData {
    std::vector<double> d;
    std::vector<double> rhs;
    std::vector<int> parent_index;
};

/// Sentinel integer value (-2).
/// NOTE(review): presumably marks index entries that were never written - confirm at the use sites.
constexpr int magic_index_value{-2};
/// Sentinel double value: the lowest finite double.
constexpr double magic_double_value{std::numeric_limits<double>::lowest()};

/// Solver variants exercised by the tests: the digit after "CellPermute" is
/// the cell permutation type and the suffix names the backend.
enum class SolverImplementation {
    CellPermute0_CPU,
    CellPermute0_GPU,
    CellPermute1_CPU,
    CellPermute1_GPU,
    CellPermute2_CPU,
    CellPermute2_GPU,
    CellPermute2_CUDA
};

/// Streams the fully qualified enumerator name of `impl`.
/// @param os    output stream to write to
/// @param impl  solver implementation to print
/// @return      `os`, to allow chaining
/// @throws std::runtime_error if `impl` is not one of the known enumerators
std::ostream& operator<<(std::ostream& os, SolverImplementation impl) {
    switch (impl) {
        case SolverImplementation::CellPermute0_CPU:
            return os << "SolverImplementation::CellPermute0_CPU";
        case SolverImplementation::CellPermute0_GPU:
            return os << "SolverImplementation::CellPermute0_GPU";
        case SolverImplementation::CellPermute1_CPU:
            return os << "SolverImplementation::CellPermute1_CPU";
        case SolverImplementation::CellPermute1_GPU:
            return os << "SolverImplementation::CellPermute1_GPU";
        case SolverImplementation::CellPermute2_CPU:
            return os << "SolverImplementation::CellPermute2_CPU";
        case SolverImplementation::CellPermute2_GPU:
            return os << "SolverImplementation::CellPermute2_GPU";
        case SolverImplementation::CellPermute2_CUDA:
            return os << "SolverImplementation::CellPermute2_CUDA";
    }
    // Reached only for a value outside the enumerator list.
    throw std::runtime_error("Invalid SolverImplementation");
}

/// Configuration of the toy model used by the solver tests: thread, cell and
/// segment counts plus four callbacks that produce the values named a, b, d
/// and rhs. By default each callback ignores its arguments and returns a
/// constant.
/// NOTE(review): the meaning of the two int arguments is not visible here;
/// confirm against the code that invokes the callbacks.
struct ToyModelConfig {
    int num_threads{1};
    int num_cells{1};
    int num_segments_per_cell{3};
    std::function<double(int, int)> produce_a{[](auto, auto) { return 3.14159; }};
    std::function<double(int, int)> produce_b{[](auto, auto) { return 42.0; }};
    std::function<double(int, int)> produce_d{[](auto, auto) { return 7.0; }};
    std::function<double(int, int)> produce_rhs{[](auto, auto) { return -16.0; }};
};

// Test fixture that builds a toy model in the global `nrn_threads` state for a
// given solver implementation, and tears it down again on destruction.
//
// The constructor configures `corenrn_param` and `interleave_permute_type`
// according to `impl`, creates the threads, fills the a/b/d/rhs vectors and
// the parent indices from `config`, applies the node permutation when one is
// requested and, for GPU implementations, copies the threads to the device.
//
// The fixture manipulates global state and releases it in its destructor, so
// it is neither copyable nor movable: a copy would release the same global
// resources a second time.
//
// TODO include some global lock as a sanity check (only one instance of
// SetupThreads should exist at any given time)
struct SetupThreads {
    // impl:   which solver implementation to configure.
    // config: shape of the toy model and the generators for its values.
    SetupThreads(SolverImplementation impl, ToyModelConfig config = {}) {
        corenrn_param.cuda_interface = false;
        corenrn_param.gpu = false;
        // The GPU/CUDA cases only flip the relevant flags and then fall
        // through to the CPU case that selects the permutation type.
        switch (impl) {
            case SolverImplementation::CellPermute0_GPU:
                corenrn_param.gpu = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute0_CPU:
                interleave_permute_type = 0;
                break;
            case SolverImplementation::CellPermute1_GPU:
                corenrn_param.gpu = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute1_CPU:
                interleave_permute_type = 1;
                break;
            case SolverImplementation::CellPermute2_CUDA:
                corenrn_param.cuda_interface = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute2_GPU:
                corenrn_param.gpu = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute2_CPU:
                interleave_permute_type = 2;
                break;
        }
        use_solve_interleave = interleave_permute_type > 0;
        nrn_threads_create(config.num_threads);
        create_interleave_info();
        int num_cells_remaining{config.num_cells}, total_cells{};
        for (auto ithread = 0; ithread < nrn_nthread; ++ithread) {
            auto& nt = nrn_threads[ithread];
            // How many cells to distribute on this thread, trying to get the right
            // total even if num_threads does not exactly divide num_cells.
            nt.ncell = num_cells_remaining / (nrn_nthread - ithread);
            total_cells += nt.ncell;
            num_cells_remaining -= nt.ncell;
            // How many segments are there in this thread?
            nt.end = nt.ncell * config.num_segments_per_cell;
            auto const padded_size = nrn_soa_padded_size(nt.end, 0);
            // Allocate one big block because the GPU data transfer code assumes this.
            // The block holds four consecutive vectors: rhs, d, a and b.
            nt._ndata = padded_size * 4;
            nt._data = static_cast<double*>(emalloc_align(nt._ndata * sizeof(double)));
            auto* vec_rhs = (nt._actual_rhs = nt._data + 0 * padded_size);
            auto* vec_d = (nt._actual_d = nt._data + 1 * padded_size);
            auto* vec_a = (nt._actual_a = nt._data + 2 * padded_size);
            auto* vec_b = (nt._actual_b = nt._data + 3 * padded_size);
            auto* parent_indices =
                (nt._v_parent_index = static_cast<int*>(emalloc_align(padded_size * sizeof(int))));
            // Magic value to check against later.
            std::fill(parent_indices, parent_indices + nt.end, magic_index_value);
            // Put all the root nodes first, then put the other segments
            // in blocks. i.e. ABCDAAAABBBBCCCCDDDD
            auto const get_index = [ncell = nt.ncell,
                                    nseg = config.num_segments_per_cell](auto icell, auto iseg) {
                if (iseg == 0) {
                    return icell;
                } else {
                    return ncell + icell * (nseg - 1) + iseg - 1;
                }
            };
            for (auto icell = 0; icell < nt.ncell; ++icell) {
                for (auto iseg = 0; iseg < config.num_segments_per_cell; ++iseg) {
                    auto const global_index = get_index(icell, iseg);
                    vec_a[global_index] = config.produce_a(icell, iseg);
                    vec_b[global_index] = config.produce_b(icell, iseg);
                    vec_d[global_index] = config.produce_d(icell, iseg);
                    vec_rhs[global_index] = config.produce_rhs(icell, iseg);
                    // 0th element is the root node, which has no parent
                    // other elements are attached in a binary tree configuration
                    // |      0      |
                    // |    /   \    |
                    // |   1     2   |
                    // |  / \   / \  |
                    // | 3   4 5   6 |
                    // TODO: include some other topologies, e.g. a long straight line, or
                    // an unbalanced tree.
                    auto const parent_id = iseg ? get_index(icell, (iseg - 1) / 2) : -1;
                    parent_indices[global_index] = parent_id;
                }
            }
            // Check we didn't mess up populating any parent indices
            for (auto i = 0; i < nt.end; ++i) {
                BOOST_REQUIRE(parent_indices[i] != magic_index_value);
                // Root nodes should come first for --cell-permute=0
                if (i < nt.ncell) {
                    BOOST_REQUIRE(parent_indices[i] == -1);
                }
            }
            if (interleave_permute_type) {
                nt._permute = interleave_order(nt.id, nt.ncell, nt.end, parent_indices);
                BOOST_REQUIRE(nt._permute);
                permute_data(vec_a, nt.end, nt._permute);
                permute_data(vec_b, nt.end, nt._permute);
                // This isn't done in CoreNEURON because these are reset every
                // time step, but permute d/rhs here so that the initial values
                // set by produce_d and produce_rhs are propagated consistently
                // to all of the solver implementations.
                permute_data(vec_d, nt.end, nt._permute);
                permute_data(vec_rhs, nt.end, nt._permute);
                // index values change as well as ordering
                permute_ptr(parent_indices, nt.end, nt._permute);
                node_permute(parent_indices, nt.end, nt._permute);
            }
        }
        if (impl == SolverImplementation::CellPermute0_GPU) {
            std::cout << "CellPermute0_GPU is a nonstandard configuration, copying data to the "
                         "device may produce warnings:";
        }
        if (corenrn_param.gpu) {
            setup_nrnthreads_on_device(nrn_threads, nrn_nthread);
        }
        if (impl == SolverImplementation::CellPermute0_GPU) {
            std::cout << "\n...no more warnings expected" << std::endl;
        }
        // Make sure we produced the number of cells we were aiming for
        BOOST_REQUIRE(total_cells == config.num_cells);
        BOOST_REQUIRE(num_cells_remaining == 0);
    }

    // The destructor releases global resources, so copying (and therefore also
    // moving) the fixture must not be possible.
    SetupThreads(SetupThreads const&) = delete;
    SetupThreads& operator=(SetupThreads const&) = delete;

    // Release device copies (GPU runs only), the per-thread buffers allocated
    // by the constructor, the interleave info and finally the threads.
    ~SetupThreads() {
        if (corenrn_param.gpu) {
            delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
        }
        for (auto& nt: *this) {
            free_memory(std::exchange(nt._data, nullptr));
            delete[] std::exchange(nt._permute, nullptr);
            free_memory(std::exchange(nt._v_parent_index, nullptr));
        }
        destroy_interleave_info();
        nrn_threads_free();
    }

    // Return one SolverData per thread holding d, rhs and the parent indices
    // in the original (un-permuted) node order.
    auto dump_solver_data() {
        std::vector<SolverData> ret{static_cast<std::size_t>(nrn_nthread)};
        // Sync the solver data from GPU to host
        update_nrnthreads_on_host(nrn_threads, nrn_nthread);
        // Un-permute the data in and store it in ret.{d,parent_index,rhs}
        for (auto ithread = 0; ithread < nrn_nthread; ++ithread) {
            auto& nt = nrn_threads[ithread];
            auto& sd = ret[ithread];
            sd.d.resize(nt.end, magic_double_value);
            sd.parent_index.resize(nt.end, magic_index_value);
            sd.rhs.resize(nt.end, magic_double_value);
            auto* inv_permute = nt._permute ? inverse_permute(nt._permute, nt.end) : nullptr;
            for (auto i = 0; i < nt.end; ++i) {
                // index in permuted vectors
                auto const p_i = nt._permute ? nt._permute[i] : i;
                // parent index in permuted vectors
                auto const p_parent = nt._v_parent_index[p_i];
                // parent index in unpermuted vectors (i.e. on the same scale as `i`)
                auto const parent = p_parent == -1
                                        ? -1
                                        : (inv_permute ? inv_permute[p_parent] : p_parent);
                // Save the values to the de-permuted return structure
                sd.d[i] = nt._actual_d[p_i];
                sd.parent_index[i] = parent;
                sd.rhs[i] = nt._actual_rhs[p_i];
            }
            delete[] inv_permute;
            // Every entry must have been overwritten with a real value.
            for (auto i = 0; i < nt.end; ++i) {
                BOOST_REQUIRE(sd.d[i] != magic_double_value);
                BOOST_REQUIRE(sd.parent_index[i] != magic_index_value);
                BOOST_REQUIRE(sd.rhs[i] != magic_double_value);
            }
        }
        return ret;
    }

    // Run the solver once on every thread.
    void solve() {
        for (auto& thread: *this) {
            nrn_solve_minimal(&thread);
        }
    }

    // begin()/end() make the fixture usable in a range-based for loop over the
    // global thread array.
    NrnThread* begin() const {
        return nrn_threads;
    }
    NrnThread* end() const {
        return nrn_threads + nrn_nthread;
    }
};

template <typename... Args>
auto solve_and_dump(Args&&... args) {
    SetupThreads threads{std::forward<Args>(args)...};
    threads.solve();
    return threads.dump_solver_data();
}

auto active_implementations() {
    // These are always available
    std::vector<SolverImplementation> ret{SolverImplementation::CellPermute0_CPU,
                                          SolverImplementation::CellPermute1_CPU,
                                          SolverImplementation::CellPermute2_CPU};
#ifdef CORENEURON_ENABLE_GPU
    // Consider making these steerable via a runtime switch in GPU builds
    ret.push_back(SolverImplementation::CellPermute0_GPU);
    ret.push_back(SolverImplementation::CellPermute1_GPU);
    ret.push_back(SolverImplementation::CellPermute2_GPU);
    ret.push_back(SolverImplementation::CellPermute2_CUDA);
#endif
    return ret;
}

// Compare the solver output of every implementation in `solver_data` against
// the CellPermute0_CPU reference entry.
//
// solver_data: per-implementation results, one SolverData per thread.
//
// The reference entry must be present, and every implementation must have the
// same number of threads and the same vector sizes as the reference; these are
// hard requirements (BOOST_REQUIRE). The d, parent_index and rhs vectors are
// then compared element by element with BOOST_TEST. The reference is also
// compared against itself, because it is part of the map being iterated.
void compare_solver_data(
    std::map<SolverImplementation, std::vector<SolverData>> const& solver_data) {
    // CellPermute0_CPU is the simplest version of the solver, it should always
    // be present and it's a good reference to use
    constexpr auto ref_impl = SolverImplementation::CellPermute0_CPU;
    BOOST_REQUIRE(solver_data.find(ref_impl) != solver_data.end());
    auto const& ref_data = solver_data.at(ref_impl);
    for (auto const& [impl, impl_data]: solver_data) {
        // Must have compatible numbers of threads.
        BOOST_REQUIRE(impl_data.size() == ref_data.size());
        std::cout << "Comparing " << impl << " to " << ref_impl << std::endl;
        for (auto n_thread = 0ul; n_thread < impl_data.size(); ++n_thread) {
            // Must have compatible numbers of segments/data entries
            BOOST_REQUIRE(impl_data[n_thread].d.size() == ref_data[n_thread].d.size());
            BOOST_REQUIRE(impl_data[n_thread].parent_index.size() ==
                          ref_data[n_thread].parent_index.size());
            BOOST_REQUIRE(impl_data[n_thread].rhs.size() == ref_data[n_thread].rhs.size());
            // Element-wise comparisons: a mismatch is reported per element
            // rather than aborting the test case.
            BOOST_TEST(impl_data[n_thread].d == ref_data[n_thread].d,
                       boost::test_tools::per_element());
            BOOST_TEST(impl_data[n_thread].parent_index == ref_data[n_thread].parent_index,
                       boost::test_tools::per_element());
            BOOST_TEST(impl_data[n_thread].rhs == ref_data[n_thread].rhs,
                       boost::test_tools::per_element());
        }
    }
}

// Run the toy model described by `args` through every active solver
// implementation, check that all of them agree with the reference
// implementation, and return the per-implementation results.
//
// args: forwarded to solve_and_dump after the implementation, i.e. the
//       remaining SetupThreads constructor arguments (typically a
//       ToyModelConfig).
template <typename... Args>
auto compare_all_active_implementations(Args&&... args) {
    std::map<SolverImplementation, std::vector<SolverData>> solver_data;
    for (auto impl: active_implementations()) {
        // Deliberately pass `args` as lvalues instead of using std::forward:
        // they are reused on every iteration, so forwarding an rvalue would
        // move from it on the first pass and hand a moved-from object to all
        // of the remaining implementations.
        solver_data[impl] = solve_and_dump(impl, args...);
    }
    compare_solver_data(solver_data);
    return solver_data;
}

// *Roughly* tuned to accommodate NVHPC 22.3 at -O0; the largest differences come
// from the pseudorandom seeded tests.
constexpr double default_tolerance{2e-11};

// May need to add some different tolerances here
BOOST_AUTO_TEST_CASE(SingleCellAndThread, *utf::tolerance(default_tolerance)) {
    // One cell on one thread; additionally check the shape of the returned
    // data for every implementation.
    constexpr std::size_t segments = 32;
    auto toy = ToyModelConfig{};
    toy.num_segments_per_cell = segments;
    auto const results = compare_all_active_implementations(toy);
    for (auto const& entry: results) {
        auto const& data = entry.second;
        BOOST_REQUIRE(data.size() == 1);  // nthreads
        BOOST_REQUIRE(data[0].d.size() == segments);
        BOOST_REQUIRE(data[0].parent_index.size() == segments);
        BOOST_REQUIRE(data[0].rhs.size() == segments);
    }
}

BOOST_AUTO_TEST_CASE(UnbalancedCellSingleThread, *utf::tolerance(default_tolerance)) {
    // A single cell with a segment count that is not a nice round number.
    auto toy = ToyModelConfig{};
    toy.num_segments_per_cell = 19;
    compare_all_active_implementations(toy);
}

BOOST_AUTO_TEST_CASE(LargeCellSingleThread, *utf::tolerance(default_tolerance)) {
    // A single cell with many segments.
    auto toy = ToyModelConfig{};
    toy.num_segments_per_cell = 4096;
    compare_all_active_implementations(toy);
}

BOOST_AUTO_TEST_CASE(ManySmallCellsSingleThread, *utf::tolerance(default_tolerance)) {
    // Many cells of the default size, all on one thread.
    auto toy = ToyModelConfig{};
    toy.num_cells = 1024;
    compare_all_active_implementations(toy);
}

BOOST_AUTO_TEST_CASE(ManySmallCellsMultiThread, *utf::tolerance(default_tolerance)) {
    // Many cells of the default size, split over two threads.
    auto toy = ToyModelConfig{};
    toy.num_cells = 1024;
    toy.num_threads = 2;
    compare_all_active_implementations(toy);
}

auto random_config() {
    std::mt19937_64 gen{42};
    ToyModelConfig config{};
    config.produce_a = [g = gen, d = std::normal_distribution{1.0, 0.1}](int icell,
                                                                         int iseg) mutable {
        return d(g);
    };
    config.produce_b = [g = gen, d = std::normal_distribution{7.0, 0.2}](int, int) mutable {
        return d(g);
    };
    config.produce_d = [g = gen, d = std::normal_distribution{-0.1, 0.01}](int, int) mutable {
        return d(g);
    };
    config.produce_rhs = [g = gen, d = std::normal_distribution{-15.0, 2.0}](int, int) mutable {
        return d(g);
    };
    return config;
}

BOOST_AUTO_TEST_CASE(LargeCellSingleThreadRandom, *utf::tolerance(default_tolerance)) {
    // A single large cell filled with pseudorandom values.
    ToyModelConfig toy = random_config();
    toy.num_segments_per_cell = 4096;
    compare_all_active_implementations(toy);
}

BOOST_AUTO_TEST_CASE(ManySmallCellsSingleThreadRandom, *utf::tolerance(default_tolerance)) {
    // Many default-sized cells on one thread, filled with pseudorandom values.
    ToyModelConfig toy = random_config();
    toy.num_cells = 1024;
    compare_all_active_implementations(toy);
}