Full Code of facebookincubator/dispenso for AI

main c787a8423663 cached

268 files

5.4 MB

1.4M tokens

2316 symbols

1 requests

Download .txt

Showing preview only (5,676K chars total). Download the full file or copy to clipboard to get everything.

Repository: facebookincubator/dispenso
Branch: main
Commit: c787a8423663
Files: 268
Total size: 5.4 MB

Directory structure:
gitextract_x94z546h/

├── .clang-format
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   └── feature_request.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── build.yml
│       ├── codeql.yml
│       └── docs.yml
├── .gitignore
├── CHANGELOG.md
├── CMakeLists.txt
├── CMakePresets.json
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── benchmarks/
│   ├── CMakeLists.txt
│   ├── benchmark_common.h
│   ├── cascading_parallel_for_benchmark.cpp
│   ├── concurrent_vector_benchmark.cpp
│   ├── fast_math/
│   │   ├── CMakeLists.txt
│   │   ├── avx512_benchmarks.cpp
│   │   ├── avx_benchmarks.cpp
│   │   ├── benchmark_helpers.h
│   │   ├── benchmarks.cpp
│   │   ├── erf_benchmarks.cpp
│   │   ├── hwy_benchmarks.cpp
│   │   ├── neon_benchmarks.cpp
│   │   └── sse_benchmarks.cpp
│   ├── for_each_benchmark.cpp
│   ├── for_latency_benchmark.cpp
│   ├── future_benchmark.cpp
│   ├── graph_benchmark.cpp
│   ├── graph_scene_benchmark.cpp
│   ├── idle_pool_benchmark.cpp
│   ├── locality_benchmark.cpp
│   ├── nested_for_benchmark.cpp
│   ├── nested_pool_benchmark.cpp
│   ├── once_function_benchmark.cpp
│   ├── pipeline_benchmark.cpp
│   ├── pool_allocator_benchmark.cpp
│   ├── run_benchmarks.py
│   ├── rw_lock_benchmark.cpp
│   ├── simple_for_benchmark.cpp
│   ├── simple_pool_benchmark.cpp
│   ├── small_buffer_benchmark.cpp
│   ├── summing_for_benchmark.cpp
│   ├── tbb_compat.h
│   ├── thread_benchmark_common.h
│   ├── timed_task_benchmark.cpp
│   └── trivial_compute_benchmark.cpp
├── cmake/
│   └── DispensoConfig.cmake.in
├── codecov.yml
├── dispenso/
│   ├── CMakeLists.txt
│   ├── async_request.h
│   ├── completion_event.h
│   ├── concurrent_object_arena.h
│   ├── concurrent_vector.h
│   ├── detail/
│   │   ├── can_invoke.h
│   │   ├── completion_event_impl.h
│   │   ├── concurrent_vector_impl.h
│   │   ├── concurrent_vector_impl2.h
│   │   ├── epoch_waiter.h
│   │   ├── future_impl.h
│   │   ├── future_impl2.h
│   │   ├── graph_executor_impl.h
│   │   ├── math.h
│   │   ├── notifier_common.h
│   │   ├── once_callable_impl.h
│   │   ├── op_result.h
│   │   ├── per_thread_info.cpp
│   │   ├── per_thread_info.h
│   │   ├── pipeline_impl.h
│   │   ├── quanta.cpp
│   │   ├── quanta.h
│   │   ├── result_of.h
│   │   ├── rw_lock_impl.h
│   │   ├── small_buffer_allocator_impl.h
│   │   ├── task_set_impl.h
│   │   └── timed_task_impl.h
│   ├── dispenso.h
│   ├── fast_math/
│   │   ├── README.md
│   │   ├── detail/
│   │   │   ├── double_promote.h
│   │   │   └── fast_math_impl.h
│   │   ├── fast_math.h
│   │   ├── float_traits.h
│   │   ├── float_traits_avx.h
│   │   ├── float_traits_avx512.h
│   │   ├── float_traits_hwy.h
│   │   ├── float_traits_neon.h
│   │   ├── float_traits_x86.h
│   │   ├── simd.h
│   │   └── util.h
│   ├── for_each.h
│   ├── future.h
│   ├── graph.cpp
│   ├── graph.h
│   ├── graph_executor.cpp
│   ├── graph_executor.h
│   ├── latch.h
│   ├── once_function.h
│   ├── parallel_for.h
│   ├── pipeline.h
│   ├── platform.h
│   ├── pool_allocator.cpp
│   ├── pool_allocator.h
│   ├── priority.cpp
│   ├── priority.h
│   ├── resource_pool.h
│   ├── rw_lock.h
│   ├── schedulable.h
│   ├── small_buffer_allocator.cpp
│   ├── small_buffer_allocator.h
│   ├── small_vector.h
│   ├── spsc_ring_buffer.h
│   ├── task_set.cpp
│   ├── task_set.h
│   ├── third-party/
│   │   └── moodycamel/
│   │       ├── LICENSE.md
│   │       ├── README.txt
│   │       ├── blockingconcurrentqueue.h
│   │       ├── concurrentqueue.h
│   │       └── lightweightsemaphore.h
│   ├── thread_id.cpp
│   ├── thread_id.h
│   ├── thread_pool.cpp
│   ├── thread_pool.h
│   ├── timed_task.cpp
│   ├── timed_task.h
│   ├── timing.cpp
│   ├── timing.h
│   ├── tsan_annotations.cpp
│   ├── tsan_annotations.h
│   ├── util.h
│   └── utils/
│       └── graph_dot.h
├── docs/
│   ├── Doxyfile
│   ├── benchmarks/
│   │   ├── benchmark_results.md
│   │   ├── concurrent_vector_details.md
│   │   ├── concurrent_vector_tcmalloc_details.md
│   │   ├── for_latency_details.md
│   │   ├── future_details.md
│   │   ├── graph_details.md
│   │   ├── graph_scene_details.md
│   │   ├── idle_pool_details.md
│   │   ├── index.html
│   │   ├── nested_for_details.md
│   │   ├── nested_pool_details.md
│   │   ├── once_function_details.md
│   │   ├── pipeline_details.md
│   │   ├── pool_allocator_details.md
│   │   ├── rw_lock_details.md
│   │   ├── simple_for_details.md
│   │   ├── simple_pool_details.md
│   │   ├── small_buffer_details.md
│   │   ├── summing_for_details.md
│   │   ├── timed_task_details.md
│   │   └── trivial_compute_details.md
│   ├── building.md
│   ├── custom.css
│   ├── design/
│   │   ├── barrier_dispatch.md
│   │   ├── coroutines.md
│   │   ├── cpp20_concepts.md
│   │   ├── fast_math_roadmap.md
│   │   ├── parallel_algorithms.md
│   │   ├── release_checklist.md
│   │   └── roadmap.md
│   ├── getting_started.md
│   ├── groups.dox
│   ├── header.html
│   ├── mainpage.md
│   ├── migrating_from_openmp.md
│   ├── migrating_from_tbb.md
│   └── third-party/
│       └── doxygen-awesome/
│           ├── doxygen-awesome-darkmode-toggle.js
│           └── doxygen-awesome.css
├── examples/
│   ├── CMakeLists.txt
│   ├── concurrent_vector_example.cpp
│   ├── for_each_example.cpp
│   ├── future_example.cpp
│   ├── graph_example.cpp
│   ├── latch_example.cpp
│   ├── parallel_for_example.cpp
│   ├── pipeline_example.cpp
│   ├── resource_pool_example.cpp
│   └── task_set_example.cpp
├── results/
│   ├── android_arm64.json
│   ├── linux_x64.json
│   ├── macos_arm64.json
│   └── windows_x64.json
├── run_bench.bat
├── scripts/
│   ├── BENCHMARKING.md
│   ├── compare_benchmarks.py
│   ├── generate_charts.py
│   ├── generate_plotly_benchmarks.py
│   ├── run_benchmarks.py
│   ├── update_benchmarks.py
│   └── update_package_managers.py
└── tests/
    ├── CMakeLists.txt
    ├── async_request_test.cpp
    ├── chunked_for_test.cpp
    ├── completion_event_test.cpp
    ├── concurrent_object_arena_test.cpp
    ├── concurrent_vector_a_test.cpp
    ├── concurrent_vector_b_test.cpp
    ├── concurrent_vector_default_test.cpp
    ├── concurrent_vector_nocache_test.cpp
    ├── concurrent_vector_test_common.h
    ├── concurrent_vector_test_common_types.h
    ├── fast_math/
    │   ├── CMakeLists.txt
    │   ├── acos_test.cpp
    │   ├── asin_test.cpp
    │   ├── atan2_test.cpp
    │   ├── atan_test.cpp
    │   ├── avx512_test.cpp
    │   ├── avx_test.cpp
    │   ├── bivariate_ulp_eval.h
    │   ├── cbrt_test.cpp
    │   ├── cos_test.cpp
    │   ├── erf_test.cpp
    │   ├── eval.cpp
    │   ├── eval.h
    │   ├── exp10_test.cpp
    │   ├── exp2_test.cpp
    │   ├── exp_test.cpp
    │   ├── expm1_test.cpp
    │   ├── frexp_test.cpp
    │   ├── hwy_test.cpp
    │   ├── hypot_test.cpp
    │   ├── ldexp_test.cpp
    │   ├── log10_test.cpp
    │   ├── log1p_test.cpp
    │   ├── log2_test.cpp
    │   ├── log_test.cpp
    │   ├── neon_test.cpp
    │   ├── pow_test.cpp
    │   ├── pow_ulp_eval.cpp
    │   ├── simd_test_utils.h
    │   ├── sin_test.cpp
    │   ├── sincos_test.cpp
    │   ├── sinpi_test.cpp
    │   ├── sse_test.cpp
    │   ├── tan_test.cpp
    │   ├── tanh_test.cpp
    │   ├── test_main.cpp
    │   ├── ulp_eval.cpp
    │   └── util_test.cpp
    ├── for_each_test.cpp
    ├── forward_shared_pool.cpp
    ├── future_test.cpp
    ├── graph_test.cpp
    ├── greedy_for_ranges_test.cpp
    ├── greedy_for_test.cpp
    ├── latch_test.cpp
    ├── once_function_test.cpp
    ├── pipeline_test.cpp
    ├── pool_allocator_test.cpp
    ├── priority_test.cpp
    ├── resource_pool_test.cpp
    ├── rw_lock_test.cpp
    ├── shared_pool_test.cpp
    ├── small_buffer_allocator_test.cpp
    ├── small_vector_test.cpp
    ├── spsc_ring_buffer_test.cpp
    ├── task_set_test.cpp
    ├── test_tid.h
    ├── thread_id_test.cpp
    ├── thread_pool_test.cpp
    ├── timed_task_test.cpp
    ├── timing_test.cpp
    └── util_test.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands:   false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass:      false
  AfterControlStatement: false
  AfterEnum:       false
  AfterFunction:   false
  AfterNamespace:  false
  AfterObjCDeclaration: false
  AfterStruct:     false
  AfterUnion:      false
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit:     100
CommentPragmas:  '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat:   false
ForEachMacros:   [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
  - Regex:           '^<.*\.h(pp)?>'
    Priority:        1
  - Regex:           '^<.*'
    Priority:        2
  - Regex:           '.*'
    Priority:        3
IndentCaseLabels: true
IndentWidth:     2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd:   ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
  - Language:        TextProto
    Delimiters:
      - pb
ReflowComments:  true
SortIncludes:    true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles:  false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard:        Cpp11
TabWidth:        4
UseTab:          Never
...


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yml
================================================
name: Bug Report
description: File a bug report
title: "[Bug]: "
labels: ["bug", "triage"]
assignees:
  - graphicsMan
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
  - type: input
    id: contact
    attributes:
      label: Contact Details
      description: How can we get in touch with you if we need more info?
      placeholder: ex. email@example.com
    validations:
      required: false
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
      value: "A bug happened!"
    validations:
      required: true
  - type: dropdown
    id: version
    attributes:
      label: Version
      description: What version of our software are you running?
      options:
        - 1.0 (Default)
        - latest (Edge)
    validations:
      required: true
  - type: checkboxes
    id: terms
    attributes:
      label: Code of Conduct
      description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/facebookincubator/dispenso/blob/main/CODE_OF_CONDUCT.md)
      options:
        - label: I agree to follow this project's Code of Conduct
          required: true


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.yml
================================================
name: Feature Request
description: File a feature request
title: "[Feature Request]: "
labels: ["feature", "request"]
assignees:
  - graphicsMan
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this feature request!
  - type: input
    id: contact
    attributes:
      label: Contact Details
      description: How can we get in touch with you if we need more info?
      placeholder: ex. email@example.com
    validations:
      required: false
  - type: textarea
    id: whats-wanted
    attributes:
      label: What is the desired feature?
      description: Give some details
      value: "Details here"
    validations:
      required: true
  - type: checkboxes
    id: terms
    attributes:
      label: Code of Conduct
      description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/facebookincubator/dispenso/blob/main/CODE_OF_CONDUCT.md)
      options:
        - label: I agree to follow this project's Code of Conduct
          required: true


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# PR Details

<!--- Provide a general summary of your changes in the Title above -->

## Description

<!--- Describe your changes in detail -->

## Related Issue

<!--- Please link to the issue here: -->

## Motivation and Context

<!--- Why is this change required? What problem does it solve? -->

## Test Plan

<!--- Please describe in detail how you tested your changes. -->
<!--- Include details of your testing environment, and the tests you ran to -->
<!--- see how your change affects other areas of the code, etc. -->

## Types of changes

<!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->

- [ ] Docs change
- [ ] Refactoring
- [ ] Dependency upgrade
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)

## Checklist

<!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
<!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->

- [ ] My code follows the code style of this project.
- [ ] I have run clang-format.
- [ ] My change requires a change to the documentation.
- [ ] I have updated the documentation accordingly.
- [ ] I have read the **CONTRIBUTING** document.
- [ ] I have added tests to cover my changes.
- [ ] All new and existing tests passed, including in ASAN and TSAN modes (if available on your platform).


================================================
FILE: .github/workflows/build.yml
================================================
name: Build and test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

permissions:
  contents: read

env:
  CTEST_OUTPUT_ON_FAILURE: 1

jobs:
  #############################################################################
  # Core platform builds - strategic sampling of OS/compiler/standard/library
  #############################################################################
  build-matrix:
    name: ${{ matrix.name }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          # Linux - GCC baseline
          - name: Linux GCC C++14
            os: ubuntu-latest
            cmake_args: -DCMAKE_CXX_STANDARD=14
          # Linux - Clang with C++20 concepts
          - name: Linux Clang C++20
            os: ubuntu-latest
            cc: clang
            cxx: clang++
            cmake_args: -DCMAKE_CXX_STANDARD=20
          # macOS - C++20 with static library
          - name: macOS C++20 static
            os: macos-latest
            cmake_args: -DCMAKE_CXX_STANDARD=20 -DDISPENSO_SHARED_LIB=OFF
          # Windows - MSVC baseline
          - name: Windows MSVC C++14
            os: windows-latest
            cmake_args: -DCMAKE_CXX_STANDARD=14
          # Windows - MSVC C++20 with static library
          - name: Windows MSVC C++20 static
            os: windows-latest
            cmake_args: -DCMAKE_CXX_STANDARD=20 -DDISPENSO_SHARED_LIB=OFF
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: |
          mkdir build && cd build
          cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ${{ matrix.cmake_args }}
        env:
          CC: ${{ matrix.cc }}
          CXX: ${{ matrix.cxx }}
      - name: Build
        working-directory: ./build
        run: cmake --build . --parallel 4 --config Release
      - name: Test
        working-directory: ./build
        run: ctest -LE flaky --build-config Release

  #############################################################################
  # Architecture-specific builds
  #############################################################################
  build-linux-x86:
    name: Linux x86 (32-bit)
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install 32-bit support
        run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib
      - name: Configure
        run: |
          mkdir build && cd build
          cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_CXX_FLAGS="-m32" -DCMAKE_C_FLAGS="-m32"
      - name: Build
        working-directory: ./build
        run: cmake --build . --parallel 4
      - name: Test
        working-directory: ./build
        run: ctest -LE flaky

  build-macos-gcc:
    name: macOS GCC C++14 (ARM64)
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up GCC
        run: |
          brew list gcc || brew install gcc
          GCC_PREFIX=$(brew --prefix gcc)
          GCC_VER=$(ls "$GCC_PREFIX/bin"/gcc-* | grep -oE '[0-9]+$' | sort -n | tail -1)
          echo "CC=$GCC_PREFIX/bin/gcc-$GCC_VER" >> $GITHUB_ENV
          echo "CXX=$GCC_PREFIX/bin/g++-$GCC_VER" >> $GITHUB_ENV
      - name: Configure
        run: |
          mkdir build && cd build
          cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_CXX_STANDARD=14
      - name: Build
        working-directory: ./build
        run: cmake --build . --parallel 4
      - name: Test
        working-directory: ./build
        run: ctest -LE flaky

  build-windows-x86:
    name: Windows x86 (32-bit)
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: |
          mkdir build && cd build
          cmake .. -A Win32 -DDISPENSO_BUILD_TESTS=ON
      - name: Build
        working-directory: ./build
        run: cmake --build . --parallel 4 --config Release
      - name: Test
        working-directory: ./build
        run: ctest -LE flaky --build-config Release

  #############################################################################
  # Sanitizer builds - critical for concurrency library
  #############################################################################
  sanitizers:
    name: ${{ matrix.name }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          # TSan requires clang - GCC 13's TSan has spurious warnings about atomic_thread_fence
          - name: Thread Sanitizer
            cmake_args: -DTHREAD_SANITIZER=ON
            cc: clang
            cxx: clang++
          - name: Address Sanitizer
            cmake_args: -DADDRESS_SANITIZER=ON
    steps:
      - uses: actions/checkout@v4
      - name: Configure
        run: |
          mkdir build && cd build
          cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug ${{ matrix.cmake_args }}
        env:
          CC: ${{ matrix.cc }}
          CXX: ${{ matrix.cxx }}
      - name: Build
        working-directory: ./build
        run: cmake --build . --parallel 4
      - name: Test
        working-directory: ./build
        run: ctest -LE flaky

  #############################################################################
  # Documentation build
  #############################################################################
  docs:
    name: Documentation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install Doxygen
        run: sudo apt-get update && sudo apt-get install -y doxygen graphviz
      - name: Build documentation
        working-directory: ./docs
        run: doxygen Doxyfile
      - name: Check for warnings
        working-directory: ./docs
        run: |
          if [ -s doxygen_warnings.log ]; then
            echo "Doxygen warnings found:"
            cat doxygen_warnings.log
            exit 1
          fi
          echo "No Doxygen warnings."

  #############################################################################
  # Code coverage
  #############################################################################
  coverage:
    name: Code Coverage
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install lcov
        run: sudo apt-get update && sudo apt-get install -y lcov
      - name: Configure with coverage
        run: |
          mkdir build && cd build
          cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug \
            -DCMAKE_CXX_FLAGS="--coverage -fprofile-arcs -ftest-coverage -fprofile-update=atomic" \
            -DCMAKE_C_FLAGS="--coverage -fprofile-arcs -ftest-coverage -fprofile-update=atomic" \
            -DCMAKE_EXE_LINKER_FLAGS="--coverage"
      - name: Build
        working-directory: ./build
        run: cmake --build . --parallel 4
      - name: Test (stable tests)
        working-directory: ./build
        run: ctest -LE flaky
      - name: Test (TimedTask, with retries)
        working-directory: ./build
        run: ctest -L flaky -R "^TimedTaskTest\." --repeat until-pass:5
      - name: Generate coverage report
        run: |
          lcov --capture --directory build --output-file coverage.info --ignore-errors mismatch,gcov,negative
          lcov --remove coverage.info '/usr/*' '*/build/_deps/*' '*/tests/*' '*/third-party/*' '*/benchmarks/*' '*/examples/*' '*/bits/*' '*/ext/*' --output-file coverage.info --ignore-errors unused
          lcov --list coverage.info
      - name: Upload to Codecov
        uses: codecov/codecov-action@v5
        with:
          files: coverage.info
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: false
          verbose: true


================================================
FILE: .github/workflows/codeql.yml
================================================
name: CodeQL

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  schedule:
    - cron: '17 9 * * 1'

jobs:
  analyze:
    name: Analyze (C/C++)
    runs-on: ubuntu-latest
    permissions:
      security-events: write
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v4
        with:
          languages: c-cpp

      - name: Build
        run: |
          cmake -S . -B build \
            -DDISPENSO_BUILD_TESTS=OFF \
            -DDISPENSO_BUILD_BENCHMARKS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build build --parallel 4

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v4


================================================
FILE: .github/workflows/docs.yml
================================================
name: Docs

on:
  push:
    branches: [ main ]

permissions:
  contents: write

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4

    - name: Set version from CHANGELOG
      run: |
        VERSION=$(grep -m1 '^[0-9]' CHANGELOG.md | sed 's/ .*//')
        sed -i "s/^PROJECT_NUMBER.*/PROJECT_NUMBER         = $VERSION/" docs/Doxyfile

    - name: Doxygen Action
      uses: mattnotmitt/doxygen-action@v1
      with:
        working-directory: "docs/"
        doxyfile-path: "./Doxyfile"


    - name: Fix Doxygen output permissions and copy benchmark dashboard
      run: |
        sudo chown -R $(id -u):$(id -g) docs/doxygen
        cp -r docs/benchmarks docs/doxygen/html/benchmarks

    - name: Deploy
      uses: peaceiris/actions-gh-pages@v4
      with:
        github_token: ${{ secrets.GITHUB_TOKEN }}
        publish_dir: ./docs/doxygen/html


================================================
FILE: .gitignore
================================================
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
*.a
bin
lib

# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
/.project

# generated cmake files #
#########################
*CMakeCache.txt
*.log
*.make
*.cmake
CMakeFiles
Makefile
*Dir

/build/*
docs/doxygen/

# Clang #
#########
.cache/


================================================
FILE: CHANGELOG.md
================================================
1.5.1 (March 28, 2026)

### Bug fixes
* Fixed `__ulock_wait`/`__ulock_wake` usage on macOS versions prior to 10.12 and on PowerPC where these APIs are unavailable. The ulock path is now guarded behind a runtime version check with `pthread_cond` fallback.
* Fixed ARM64 Windows build failure: `notifier_common.h` incorrectly defined `_ARM_` (32-bit ARM) instead of `_ARM64_` on ARM64 Windows, causing `winnt.h` to reference missing 32-bit ARM intrinsics.
* Fixed `platform.h` version macros not being updated for the 1.5.0 release (they were stuck at 1.4.1).
* Removed vestigial `CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS` from CMakeLists.txt. All public APIs now use proper `DISPENSO_DLL_ACCESS` annotations; the blanket export is no longer needed and is prohibited by vcpkg's maintainer guide.

### Build system
* Added `DISPENSO_USE_SYSTEM_CONCURRENTQUEUE` CMake option to use system-installed `moodycamel::concurrentqueue` instead of bundled copy (default OFF), for vcpkg compatibility
* Export C++ standard requirement via `target_compile_features` so downstream consumers compile with at least the same standard dispenso was built with
* Respect `BUILD_SHARED_LIBS` for `DISPENSO_SHARED_LIB` default, allowing vcpkg to control static/shared linkage

### Infrastructure
* Added package manager release automation script (`scripts/update_package_managers.py`) with post-write checksum verification, platform-aware testing, and PR body templates following each repo's CONTRIBUTING.md
* Added CodeQL security analysis workflow scoped to main branch
* Added package manager badges (vcpkg, Conan, Homebrew, MacPorts) to README
* Added release checklist documentation

1.5.0 (March 22, 2026)

### New features
* Added `SmallVector` container with configurable inline storage, reducing heap allocations for small collections
* Added `SPSCRingBuffer` lock-free single-producer single-consumer ring buffer with power-of-two optimization
* Added `scheduleBulk(count, generator)` API to ThreadPool, TaskSet, and ConcurrentTaskSet for efficient bulk task submission with reduced atomic contention
* Added random-access iterator specialization for `for_each_n`, with iterator category dispatch for optimal chunk boundary computation
* Added Mac futex-based wakeup using `os_sync_wait_on_address` (macOS 14.4+) with `__ulock_wait` fallback
* Added C++20 concept constraints for better error messages when template requirements aren't met
* Added experimental `fast_math` sublibrary with SIMD-accelerated math functions including `log2`, `exp2`, `exp`, `exp10`, `cbrt`, `sin`, `cos`, `sincos`, `asin`, and `atan2` with configurable accuracy/performance trade-offs and multiple SIMD backends (SSE4.1, AVX2, AVX512, NEON, Highway). **API unstable** — gated by `DISPENSO_BUILD_FAST_MATH` CMake option
* Added benchmark runner and chart generation scripts with multi-platform support
* Added interactive Plotly.js benchmark dashboard generator

### Performance improvements
* ThreadPool atomics simplification: replaced 3 per-task tracking atomics with `numSleeping_` + batched `workRemaining_` decrements, reducing per-task atomic operations from 5-6 to ~1 (+24% geometric mean across 568 benchmark tests)
* ThreadPool wakeup heuristic: reduced futex calls from ~1M to ~11K by only waking when capacity cannot cover queued work; `mostly_idle` benchmark 2.6x faster
* Cache-line alignment for `poolLoadFactor_` and `numThreads_` to reduce false sharing (L1 cache miss rate 16.33% → 6.92% on `schedule()` hot path)
* Graph executor optimizations: `SmallVector` for node dependents, pre-reserve capacity, inline continuation (build_big_tree 1.95x faster, build_dep_chain 2.14x faster)
* Serial pipeline SPSC optimization: dedicated executor with ring buffers for fully-serial pipelines (~33% faster)
* Inline continuation for serial pipeline stages (scheduling overhead reduced ~3x)
* Bulk wakeup with threshold-based `wakeN()`/`wakeAll()` selection for efficient bulk scheduling
* `for_each_n` converted to `scheduleBulk`: 2.1x faster at 32 threads, 1.6x at 64 threads for 100M elements
* `parallel_for` kAuto bulk scheduling: trivial_compute 52ms → 19ms at 192 threads (matching TBB)
* ConcurrentVector: non-atomic buffer pointer cache for read-hot paths (disabled on ARM), inline asm `bsr` for `detail::log2` on x86, and platform-adaptive `bucketAndSubIndexForIndex` fast path
* `OnceFunction` devirtualized: replaced vtable-based dispatch with direct function pointer, eliminating indirect call overhead
* `TaskSet`/`ConcurrentTaskSet` noWait path: replaced `shared_ptr<Atomic>` with pool-allocated single-atomic chunk index, reducing allocation overhead

### Infrastructure
* Benchmark runner (`run_benchmarks.py`) with JSON output and machine info collection
* Chart generator (`generate_charts.py`) with specialized visualizations per benchmark suite
* Multi-platform benchmark composition (`update_benchmarks.py --compose`) for unified documentation
* Prefer system-installed GoogleTest, Taskflow, and TBB in CMake with FetchContent fallback
* Added oneTBB compatibility via `tbb_compat.h` wrapper for `task_scheduler_init`
* Added BUCK targets for `idle_pool_benchmark`, `nested_pool_benchmark`, `for_each_benchmark`, and `locality_benchmark`

### Documentation
* Added examples directory with compilable example programs for each feature
* Added Getting Started guide (`docs/getting_started.md`) with inline code snippets from examples
* Added OpenMP migration guide (`docs/migrating_from_openmp.md`)
* Improved README clarity, discoverability, and feature descriptions

### CI and build improvements
* Comprehensive CI matrix: 11 jobs covering 3 architectures (x64, x86, ARM64), 3 OSes, 3 compilers (GCC, Clang, MSVC), C++14/20, TSan/ASan, code coverage, and Doxygen builds
* Added codecov.yml for enforcing 92% code coverage threshold

### Bug fixes
* Fixed ABI mismatch between exception and no-exception builds: `TaskSetBase` and `FutureImplResultMember` had conditionally compiled members that shifted struct layout depending on `-fno-exceptions`, causing crashes when translation units disagreed. Exception-related data members are now always present in the layout (with zero runtime cost when exceptions are disabled). **Note:** this changes the ABI for builds that previously used `-fno-exceptions`; recompile all code against 1.5 headers if mixing exception modes.
* Fixed `ConcurrentTaskSet` parent stack overflow when tasks recursively schedule to the same task set: self-recursive inlining via `tryExecuteNext()` repeatedly pushed the same `TaskSetBase*` onto the thread-local parent stack (depth limit 64), causing an abort under heavy inlining. Fix skips redundant push/pop when the TaskSet is already the current parent.
* Fixed pipeline `kLimited` scheduler `wait()` losing late-arriving items: the LIMITED path only drained the local queue without waiting for in-flight items, so items enqueued by a previous stage's CTS task after the drain could be permanently orphaned. Fix replaces the drain-only loop with an `outstanding_`-based spin that ensures all items complete, with `tryExecuteNext()` to keep the calling thread productive.
* Fixed `parallel_for` with `kAuto` chunking incorrectly falling back to static chunking when `maxThreads` was left at default
* Fixed `NoOpIter` missing iterator trait typedefs for C++20 compliance
* Fixed `NoOpIter::operator*()` / `operator[]` static local data race
* Fixed SmallBufferAllocator unsigned underflow where `allocSmallBuffer<1/2/3>()` returned nullptr instead of a 4-byte block
* Fixed `cpuRelax()` being a no-op on MSVC (missing `_mm_pause()` / `__yield()` intrinsics)
* Fixed x86 Windows build issues
* Fixed Doxygen documentation warnings
* Fixed pipeline exception safety: exceptions thrown in pipeline stage functors are now caught and propagated to the caller via `ConcurrentTaskSet`. Added RAII guards for stage resource cleanup, `OnceFunction::cleanupNotRun()` for proper deallocation of unexecuted tasks, and a deadlock fix in the `kLimited` scheduler's resource spin loop when exceptions leave no threads to release resources.
* Fixed MSVC lambda capture for constexpr variable
* Fixed `idle_pool_benchmark` fairness (loop bound, static scheduling, and pool placement)
* Fixed `nested_for_benchmark` incorrect loop bound, static scheduling, and pool placement

### Test improvements
* Added comprehensive tests for thread_pool spin-poll with sleep mode
* Added comprehensive task_set edge case tests
* Added comprehensive tests for concurrent_object_arena
* Added edge case tests for pool_allocator
* Added timing tests for getTime() function
* Added tests for Graph/Subgraph accessors and BiPropNode edge cases
* Added `SmallVector` test suite (43 tests)
* Added `SPSCRingBuffer` test suite (47 tests)
* Improved overall test coverage from ~89% to 96.3% (dispenso source only, excluding stdlib and third-party)

1.4.1 (January 5, 2026)

### Bug fixes and build improvements
* Fixed clock frequency calculation for mac-arm platforms
* Addressed potential race condition at TimedTaskScheduler construction
* Adjusted build platforms for better compatibility

1.4 (January 2, 2025)

### Efficiency improvements, bug and warning fixes
* Added some benchmarks and comparison with TaskFlow (thanks andre-nguyen!)
* Fixed compilation when compiling with DISPENSO_DEBUG (thanks EscapeZero!)
* Improved efficiency on Linux for infrequent thread pool usage.  Reduces polling overhead by 10x by switching to event-based wakeup instead of spin polling.
* Fix C++20 compilation issues (thanks aavbsouza!)
* Fix several build warnings (thanks SeaOtocinclus!)
* Add conda package badge, disable gtest install (thanks JeongSeok Lee!)
* Solved rare post-main shutdown issues with NewThreadInvoker
* Fixed test issues for 32-bit builds
* Fixed broken test logic for test thread IDs
* Fixed various build warnings

1.3 (April 25, 2024)

### Bug fixes, portability enhancements, and small functionality enhancements

* Fixed several generic warnings (thanks michel-slm!)
* cpuRelax added for PowerPC and ARM (thanks barracuda156!)
* Added missing header (thanks ryandesign!)
* Try to detect and add libatomic when required (thanks for discussions barracuda156!)
* Enable small buffers from small buffer allocators to go down to 4 bytes (thanks for discussion David Caruso!).  This is handy for 32-bit builds where pointers are typically 4 bytes
* Ensure that NOMINMAX is propagated for CMake Windows builds (thanks SeaOtocinclus!)
* Fix some cases using std::make_shared for types requiring large alignment, which is a bug prior to C++17 (thanks for help finding these SeaOtocinclus!)
* Set up CI on GitHub Actions, including builds for Mac and Windows in addition to Linux (thanks SeaOtocinclus!)
* Add an environment variable `DISPENSO_MAX_THREADS_PER_POOL` to limit max number of threads available to any thread pool.  In the spirit of `OMP_NUM_THREADS`.  (thanks Yong-Chull Jang!)
* Slight change of behavior w.r.t. use of `maxThreads` option in `ForEachOptions` and `ParForOptions` to limit concurrency the same way in both blocking and non-blocking `for_each` and `parallel_for` (thanks Arnie Yuan!)
* Various fixes to enable CMake builds on various 32-bit platforms (thanks for discussions barracuda156!)
* Updates to README

Known Issues:
* Large subset of dispenso tests are known to fail on 32-bit PPC Mac.  If you have access to such a machine and are willing to help debug, it would be appreciated!
* NewThreadInvoker can have a program shutdown race on Windows platforms if the threads launched by it are not finished running by end of main()

1.2 (December 27, 2023)

### Bug fixes and functionality enhancements

* Several small bug fixes, especially around 32-bit builds and at-exit shutdown corner cases, and TSAN finding benign races and/or causing timeout due to pathological lock-free behaviors in newer versions of TSAN
* Improve accuracy of `dispenso::getTime`
* Add C++-20-like `Latch` functionality
* Add mechanism for portable thread priorities
* Add a timed task/periodically scheduled task feature.  Average and standard deviation of the accuracy of `dispenso::TimedTaskScheduler` are both much better than `folly::FunctionScheduler` (from 2x to 10x+ depending on settings and platform)
* Enhancements to `parallel_for`
  * Add an option that allows to automatically reduce the number of threads working on a range if the work is too cheap to justify parallelization.  This can result in 3000x+ speedups for very lightweight loops
  * Reuse per-thread state containers across parallel for calls (these must block in-between, or be thread-safe types)
  * `parallel_for` functors may now be called with an input range directly instead of requiring a ChunkedRange.  This is as simple as providing a functor/lambda that takes the additional argument, just as was previously done with `ChunkedRange`.  `ChunkedRange`s still work, and this is fully backward compatible
* `ThreadPool`s have a new option for full spin polling.  This is generally best avoided, and I'd argue never to use this for the default Global thread pool, but can be useful for a subset of threads in systems that require real-time responsivity (especially, can be combined with the thread priority feature also found in this release)
* Task graph execution (thanks Roman Fedotov!).  Building and running dispenso task graphs is typically 25% faster than the (already excellent) `TaskFlow` library in our benchmarks.  Additionally, we have a partial update feature that can enable much faster (e.g. 50x faster) execution in cases where only a small percentage of task inputs are updated (think of per-frame partial scene updates in a game)

1.1 (October 1, 2022)

### Performance and functionality enhancements

* CMake changes to allow install of targets and CMake dispenso target exports (thanks jeffamstutz!)
* Addition of typical container type definitions for ConcurrentVector (thanks Michael Jung!)
* Large performance improvements for Futures and CompletionEvents on MacOs.  Resulted in order-of-magnitude speedups for those use cases on MacOs.
* Addition of new benchmark for performance with infrequent use of `parallel_for`, `for_latency_benchmark`
* Fixes to ensure `parallel_for` works with thread pools with zero threads (thanks kevinbchen!).  Further work has been done to ensure that thread pools with zero threads simply always run code inline.
* By default, the global thread pool uses one fewer thread than the machine has hardware threads.  This behavior was introduced because dispenso very often runs on the calling thread as well as pool threads, and so one fewer thread in the pool can lead to better performance.
* Update googletest version to 1.12.1 (thanks porumbes!)
* Add a utility in dispenso to get a thread ID, `threadId`.  These 64-bit IDs are unique per thread, and will not be recycled.  These values grow from zero, ensuring the caller can assume they are small if number of threads also is small (e.g. you won't have an ID of `0xdeadbeef` if you only run hundreds or thousands of threads in the lifetime of the process).
* Add a utility, `getTime`, to get time quickly.  This provides the double-precision time in seconds since the first call to `getTime` after process start.
* Use a new scheduling mechanism in the thread pool when in Windows.  This resulted in up to a 13x improvement in latency between putting items in the pool and having those items run.  This scheduling is optional, but turned off for Linux and MacOs since scheduling was already fast on those platforms.
* Optimizations to enable faster scheduling in thread pools.  This resulted in a range of 5% to 45% speedup across multiple benchmarks including `future_benchmark` and `pipeline_benchmark`.
* Fixed a performance bug in work stealing logic; now dispenso outperforms TBB in the `pipeline_benchmark`
* Added a task set cancellation feature, with a relatively simple mechanism for submitted work to check if its owning task set has been cancelled.  When creating a task set, you can optionally opt into parent cancellation propagation as well.  While this propagation is fairly efficient, it did create a noticeable impact on performance in some cases, and thus it was decided to allow this behavior, but not penalize performance for those who don't need the behavior.

1.0 (November 24, 2021)

### dispenso initial release


================================================
FILE: CMakeLists.txt
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


# Oldest CMake release this top-level script declares support for.
cmake_minimum_required(VERSION 3.12)

# Use /Z7 (embedded debug info) instead of /Zi (PDB server) on MSVC+Ninja.
# /Zi requires mspdbsrv.exe which can fail with C1902 "Program database manager
# mismatch" when Ninja launches cl.exe as a subprocess.
# Guarded by if(POLICY ...) so CMake versions that do not know CMP0141 skip
# this block entirely.
if(POLICY CMP0141)
  cmake_policy(SET CMP0141 NEW)
  # Embedded debug info is requested for the Debug and RelWithDebInfo
  # configurations only.
  set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
endif()

# CMake 4.x enables C++20 module scanning by default with Ninja, which breaks
# try_compile() checks. Dispenso doesn't use C++20 modules.
# The policy is only touched when the running CMake knows it; the variable
# below is set unconditionally.
if(POLICY CMP0155)
  cmake_policy(SET CMP0155 OLD)
endif()
set(CMAKE_CXX_SCAN_FOR_MODULES OFF)

# Project declaration: C++ only. It follows the policy and variable settings
# above.
project(
  Dispenso
  VERSION 1.5.1
  DESCRIPTION "Dispenso is a library for working with sets of parallel tasks"
  LANGUAGES CXX)

# DISPENSO_STANDALONE is TRUE when this directory is the root of the source
# tree being configured, and FALSE when dispenso is pulled in by a parent
# project. The GNU install-directory variables are only loaded for standalone
# builds.
set(DISPENSO_STANDALONE FALSE)
if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
  set(DISPENSO_STANDALONE TRUE)
  include(GNUInstallDirs)
endif()

# Make the project's own CMake modules discoverable.
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules)

# Main project setup: only applied when Dispenso is the top-level project.
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
  set(CMAKE_CXX_EXTENSIONS OFF)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

  # The shared-library option follows BUILD_SHARED_LIBS when that variable is
  # defined (e.g. by vcpkg); otherwise it defaults to ON.
  if(DEFINED BUILD_SHARED_LIBS)
    set(_dispenso_shared_default "${BUILD_SHARED_LIBS}")
  else()
    set(_dispenso_shared_default ON)
  endif()
  option(DISPENSO_SHARED_LIB "Build Dispenso shared library" ${_dispenso_shared_default})
  unset(_dispenso_shared_default)

endif()

# Sanitizer switches. Both default to OFF and, as their descriptions state,
# cannot be used together.
option(ADDRESS_SANITIZER "Use Address Sanitizer, incompatible with THREAD_SANITIZER" OFF)
option(THREAD_SANITIZER "Use Thread Sanitizer, incompatible with ADDRESS_SANITIZER" OFF)

# Fail loudly instead of silently preferring ADDRESS_SANITIZER when both are
# requested (for example when one value is still in the cache from an earlier
# configure and the other is passed on the command line).
if (ADDRESS_SANITIZER AND THREAD_SANITIZER)
  message(FATAL_ERROR
    "ADDRESS_SANITIZER and THREAD_SANITIZER are mutually exclusive; enable only one of them.")
endif()

if (ADDRESS_SANITIZER)
  # The address-sanitizer configuration also turns on the undefined-behavior
  # sanitizer, for both compile and link steps.
  add_compile_options(-fsanitize=address -fsanitize=undefined)
  add_link_options(-fsanitize=address -fsanitize=undefined)
elseif (THREAD_SANITIZER)
  add_compile_options(-fsanitize=thread)
  add_link_options(-fsanitize=thread)
endif()

# Default C++ standard. This is a cache entry set without FORCE, so a value
# supplied with -D or by a preset is kept instead of this default.
set(CMAKE_CXX_STANDARD 14 CACHE STRING "the C++ standard to use for this project")

# Choose between the bundled moodycamel::concurrentqueue and a system-installed
# one. Declared before the library subdirectory is added below.
set(DISPENSO_USE_SYSTEM_CONCURRENTQUEUE OFF CACHE BOOL
    "Use system-installed moodycamel::concurrentqueue instead of bundled copy")

###########################################################
# Targets
add_subdirectory(dispenso)

# Optional components; every one of them defaults to OFF.
# NOTE(review): these cache entries are declared after add_subdirectory(dispenso).
# If dispenso/CMakeLists.txt reads any of them, it would see them undefined on a
# first configure unless they were passed explicitly -- confirm.
set(DISPENSO_BUILD_TESTS OFF CACHE BOOL "Should tests be built?")
set(DISPENSO_BUILD_BENCHMARKS OFF CACHE BOOL "Should benchmarks be built?")
set(DISPENSO_BUILD_EXAMPLES OFF CACHE BOOL "Should examples be built?")
set(DISPENSO_BUILD_FAST_MATH OFF CACHE BOOL "Build experimental fast_math sublibrary (API unstable)")
# The accepted DISPENSO_FAST_MATH_SIMD values are validated further down, and
# only when DISPENSO_BUILD_FAST_MATH is ON.
set(DISPENSO_FAST_MATH_SIMD "none" CACHE STRING "SIMD ISA for fast_math targets: none, native, sse4.1, avx2, avx512, neon")
set(DISPENSO_FAST_MATH_HIGHWAY OFF CACHE BOOL "Enable Highway SIMD backend for fast_math (fetches Highway if not found)")

# Configuration for the experimental fast_math sublibrary. Nothing in this
# block runs unless DISPENSO_BUILD_FAST_MATH is ON.
if(DISPENSO_BUILD_FAST_MATH)
  # Set compiler flags for the chosen SIMD backend.
  # Each level is cumulative: avx2 implies sse4.1, avx512 implies avx2+sse4.1.
  # "native" auto-detects all ISA extensions supported by the build machine.
  # The result is stored in DISPENSO_FAST_MATH_SIMD_FLAGS (presumably consumed
  # by the fast_math targets, which are defined outside this file).
  set(DISPENSO_FAST_MATH_SIMD_FLAGS "")
  if(DISPENSO_FAST_MATH_SIMD STREQUAL "native")
    if(MSVC)
      # MSVC doesn't have -march=native; AVX2 is the best portable flag.
      set(DISPENSO_FAST_MATH_SIMD_FLAGS /arch:AVX2)
    else()
      set(DISPENSO_FAST_MATH_SIMD_FLAGS -march=native)
    endif()
  elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "sse4.1")
    if(NOT MSVC)
      # MSVC enables SSE4.1 by default on x64.
      set(DISPENSO_FAST_MATH_SIMD_FLAGS -msse4.1)
    endif()
  elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "avx2")
    if(MSVC)
      set(DISPENSO_FAST_MATH_SIMD_FLAGS /arch:AVX2)
    else()
      set(DISPENSO_FAST_MATH_SIMD_FLAGS -msse4.1 -mavx2 -mfma)
    endif()
  elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "avx512")
    if(MSVC)
      set(DISPENSO_FAST_MATH_SIMD_FLAGS /arch:AVX512)
    else()
      set(DISPENSO_FAST_MATH_SIMD_FLAGS -msse4.1 -mavx2 -mfma -mavx512f -mavx512bw -mavx512dq)
    endif()
  elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "neon")
    # NEON is implicit on aarch64; no extra flags needed.
  elseif(NOT DISPENSO_FAST_MATH_SIMD STREQUAL "none")
    # Any value other than the ones handled above is a configuration error.
    message(FATAL_ERROR "Unknown DISPENSO_FAST_MATH_SIMD value: ${DISPENSO_FAST_MATH_SIMD}. "
      "Valid options: none, native, sse4.1, avx2, avx512, neon")
  endif()

  # Find or fetch Highway if requested.
  if(DISPENSO_FAST_MATH_HIGHWAY)
    # Prefer an installed Highway package; fall back to fetching the sources.
    find_package(hwy QUIET)
    if(NOT hwy_FOUND)
      include(FetchContent)
      FetchContent_Declare(
        highway
        GIT_REPOSITORY https://github.com/google/highway.git
        GIT_TAG        1.2.0
      )
      # Keep the fetched Highway build minimal: no tests, examples, or contrib.
      set(HWY_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
      set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE)
      set(HWY_ENABLE_CONTRIB OFF CACHE BOOL "" FORCE)
      if(COMMAND FetchContent_MakeAvailable)
        FetchContent_MakeAvailable(highway)
      else()
        # FetchContent_MakeAvailable only exists in CMake 3.14 and newer, while
        # this file declares 3.12 as its minimum. Use the equivalent manual
        # populate + add_subdirectory sequence on older releases.
        FetchContent_GetProperties(highway)
        if(NOT highway_POPULATED)
          FetchContent_Populate(highway)
          add_subdirectory(${highway_SOURCE_DIR} ${highway_BINARY_DIR})
        endif()
      endif()
    endif()
  endif()
endif()

# Optional components. Each subdirectory is added only when the matching
# DISPENSO_BUILD_* cache option is ON.

# Unit tests: testing is enabled before the tests directory is added.
if(DISPENSO_BUILD_TESTS)
  enable_testing()
  add_subdirectory(tests)
endif()

# Benchmarks.
if(DISPENSO_BUILD_BENCHMARKS)
  # Sadly any given release of folly seems to have some problem or another.  Leave disabled by default.
  # (ON here means the folly-based benchmarks are skipped unless the cache
  # entry is overridden.)
  set(BENCHMARK_WITHOUT_FOLLY ON CACHE BOOL "Should folly benchmarks be disabled?")
  add_subdirectory(benchmarks)
endif()

# Example programs.
if(DISPENSO_BUILD_EXAMPLES)
  add_subdirectory(examples)
endif()


================================================
FILE: CMakePresets.json
================================================
{
  "version": 6,
  "cmakeMinimumRequired": {
    "major": 3,
    "minor": 25,
    "patch": 0
  },
  "configurePresets": [
    {
      "name": "default",
      "displayName": "Default (system packages)",
      "description": "Basic build without vcpkg. Uses system-installed dependencies.",
      "binaryDir": "${sourceDir}/build/${presetName}",
      "cacheVariables": {
        "CMAKE_CXX_STANDARD": "17"
      }
    },
    {
      "name": "vcpkg",
      "displayName": "vcpkg",
      "description": "Use vcpkg for dependencies (TBB, GoogleTest, benchmark, etc.)",
      "binaryDir": "${sourceDir}/build/${presetName}",
      "toolchainFile": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake",
      "cacheVariables": {
        "CMAKE_CXX_STANDARD": "17"
      }
    },
    {
      "name": "dev",
      "displayName": "Development (tests + benchmarks)",
      "description": "Build everything for development: tests, benchmarks, and examples.",
      "inherits": "vcpkg",
      "cacheVariables": {
        "DISPENSO_BUILD_TESTS": "ON",
        "DISPENSO_BUILD_BENCHMARKS": "ON",
        "DISPENSO_BUILD_EXAMPLES": "ON",
        "CMAKE_CXX_STANDARD": "20"
      }
    },
    {
      "name": "win-dev",
      "displayName": "Windows Development (Ninja)",
      "description": "Windows dev build using Ninja for fast single-config builds. Run from a VS Developer Command Prompt.",
      "inherits": "dev",
      "binaryDir": "$env{TEMP}/dispenso-build/${presetName}",
      "generator": "Ninja",
      "cacheVariables": {
        "CMAKE_BUILD_TYPE": "Release",
        "VCPKG_APPLOCAL_DEPS": "OFF"
      },
      "condition": {
        "type": "equals",
        "lhs": "${hostSystemName}",
        "rhs": "Windows"
      }
    },
    {
      "name": "asan",
      "displayName": "Address Sanitizer",
      "inherits": "dev",
      "cacheVariables": {
        "ADDRESS_SANITIZER": "ON"
      }
    },
    {
      "name": "tsan",
      "displayName": "Thread Sanitizer",
      "inherits": "dev",
      "cacheVariables": {
        "THREAD_SANITIZER": "ON"
      }
    }
  ],
  "buildPresets": [
    {
      "name": "default",
      "configurePreset": "default",
      "configuration": "Release"
    },
    {
      "name": "dev",
      "configurePreset": "dev",
      "configuration": "Release"
    },
    {
      "name": "win-dev",
      "configurePreset": "win-dev"
    }
  ],
  "testPresets": [
    {
      "name": "dev",
      "configurePreset": "dev",
      "configuration": "Release",
      "output": {
        "outputOnFailure": true
      }
    },
    {
      "name": "win-dev",
      "configurePreset": "win-dev",
      "output": {
        "outputOnFailure": true
      }
    }
  ]
}


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <opensource-conduct@fb.com>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to dispenso
We want to make contributing to this project as easy and transparent as
possible.  There is a design ethos behind the library, so it is recommended to reach out via a GitHub 
issue on the project to discuss non-trivial changes you may wish to make.  These changes include, for 
example, wanting to change existing API, wanting to furnish a new utility, or wanting to change 
underlying behavior substantially.  Let's avoid situations where you put in a lot of hard work, only 
to have to change it substantially or get your pull request rejected.

## Our Development Process
This library has another home inside Facebook repos.  From there it is subjected to regular continuous integration testing on many platforms, and used by many projects.

## Pull Requests
We actively welcome your pull requests.

1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Utilize clang-format.
6. If you haven't already, complete the Contributor License Agreement ("CLA").

## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.

Complete your CLA here: <https://code.facebook.com/cla>

## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.

Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.

## Coding Style  
* 2 spaces for indentation rather than tabs
* 100 character line length
* Member variables have trailing underscore_
* BigCamelCase for classes and structs, and smallCamelCase for functions and variables (exception is if you are trying to match a substantial part of a standard library interface).
* [1TBS braces](https://en.wikipedia.org/wiki/Indentation_style#Variant:_1TBS_(OTBS))
* Most of all, try to be consistent with the surrounding code.  We have automated tools that will
  enforce clang-format style for some files (e.g. the C++ core) once we import your pull request
  into our internal code reviewing tools.

## License
By contributing to dispenso, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
[![Build and test](https://github.com/facebookincubator/dispenso/actions/workflows/build.yml/badge.svg)](https://github.com/facebookincubator/dispenso/actions/workflows/build.yml)
[![Documentation](https://img.shields.io/badge/docs-online-blue)](https://facebookincubator.github.io/dispenso)
[![codecov](https://codecov.io/gh/facebookincubator/dispenso/branch/main/graph/badge.svg)](https://codecov.io/gh/facebookincubator/dispenso)
[![Conan Center](https://img.shields.io/conan/v/dispenso)](https://conan.io/center/recipes/dispenso)
[![vcpkg](https://img.shields.io/vcpkg/v/dispenso)](https://vcpkg.io/en/package/dispenso)
[![Homebrew](https://img.shields.io/homebrew/v/dispenso)](https://formulae.brew.sh/formula/dispenso)
[![MacPorts](https://img.shields.io/badge/macports-dispenso-blue)](https://ports.macports.org/port/dispenso/)
[![Anaconda-Server Badge](https://anaconda.org/conda-forge/dispenso/badges/version.svg)](https://anaconda.org/conda-forge/dispenso)

# Dispenso

**A high-performance C++ thread pool and parallel algorithms library**

Dispenso is a modern **C++ parallel computing library** that provides work-stealing thread pools, parallel for loops, futures, task graphs, and concurrent containers. It serves as a powerful **alternative to OpenMP and Intel TBB**, offering better nested parallelism, sanitizer-clean code, and explicit thread pool control. Dispenso is used in hundreds of projects at Meta (formerly Facebook) and has been heavily tested and iterated on in production.

**Key advantages over OpenMP and TBB:**
- **No thread explosion** with nested parallel loops - dispenso's work-stealing prevents deadlocks and oversubscription
- **Clean with ASAN/TSAN** - fully sanitizer-compatible, unlike many TBB versions
- **Thread-safe shared futures** - `std::experimental::shared_future`-like API that TBB lacks, safe for multiple concurrent waiters, with much better performance than `std::future`
- **Portable** - C++14 compatible with no compiler-specific pragmas or extensions; C++20 builds gain concept constraints for clearer error messages

## Table of Contents

- [Choose Dispenso If...](#choosedispenso)
- [Features](#features)
- [Quick Start](#quickstart)
- [Comparison vs Other Libraries](#comparison)
- [Migration Guides](#migrationguides)
- [When Not to Use Dispenso](#nottouse)
- [Documentation and Examples](#examples)
- [Benchmark Results](#benchresults)
- [Installing](#installing)
- [Building](#building)
- [Known Issues](#knownissues)
- [License](#license)

<div id='choosedispenso'/>

## Choose Dispenso If...

- You need **nested parallelism** without thread explosion
- You want **sanitizer-clean** (ASAN/TSAN) concurrent code
- You want **explicit control over thread pools** rather than implicit global state
- You need **compute-bound futures**, not I/O-bound async
- You want **stable APIs** and minimal dependencies
- You need **cross-platform portability** from a C++14 baseline
- You have **multiple independent parallel loops** that can overlap (cascading `parallel_for`)

<div id='features'/>

## Features

Dispenso provides a comprehensive set of parallel programming primitives:

**Core runtime:**
* **[`ThreadPool`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_thread_pool.html)** — work-stealing thread pool backing all dispenso parallelism
* **[`TaskSet`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_task_set.html) / [`ConcurrentTaskSet`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_concurrent_task_set.html)** — task grouping with wait, cancellation, and recursive scheduling

**Parallel algorithms:**
* **[`parallel_for`](docs/getting_started.md#your-first-parallel-loop)** — parallel loops over indices, blocking or non-blocking (cascaded); cascading `parallel_for` enables overlapping independent loops without oversubscription
* **[`for_each`](docs/getting_started.md#parallel-iteration-with-for_each)** — parallel `std::for_each` / `std::for_each_n`
* **[`Future`](docs/getting_started.md#futures-for-async-results)** — high-performance thread-safe shared futures with `then()`, `when_all()`, and an API matching `std::experimental::shared_future`
* **[`Graph`](docs/getting_started.md#task-graphs)** — task graph execution with subgraph support and incremental re-evaluation
* **[`pipeline`](docs/getting_started.md#pipelines)** — parallel pipelining of streaming workloads

**Concurrent containers and synchronization:**
* **[`ConcurrentVector`](docs/getting_started.md#concurrentvector)** — concurrent growable vector, superset of TBB `concurrent_vector` API
* **[`Latch`](docs/getting_started.md#latch)** — one-shot barrier for thread synchronization
* **[`RWLock`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_r_w_lock.html)** — reader-writer spin lock, outperforms `std::shared_mutex` under low write contention
* **`SPSCRingBuffer`** — lock-free single-producer single-consumer ring buffer *(1.5.0)*

**General-purpose utilities:**
* **`SmallVector`** — inline-storage vector (not thread-aware; similar to `folly::small_vector`) *(1.5.0)*
* **`OnceFunction`** — lightweight move-only `void()` callable
* **`PoolAllocator`** — pool allocator with pluggable backing allocation (e.g. CUDA)
* **`SmallBufferAllocator`** — fast concurrent allocation for temporary objects
* **[`ResourcePool`](docs/getting_started.md#resource-pooling)** — semaphore-like guard around pooled resources
* **`CompletionEvent`** — notifiable event with wait and timed wait
* **`AsyncRequest`** — lightweight constrained message passing
* **`ConcurrentObjectArena`** — fast same-type object arena

<div id='quickstart'/>

## Quick Start

**Parallel for loop** - the most common use case:

```cpp
#include <dispenso/parallel_for.h>

// Sequential
for (size_t i = 0; i < N; ++i) {
    process(data[i]);
}

// Parallel with dispenso - just wrap it!
dispenso::parallel_for(0, N, [&](size_t i) {
    process(data[i]);
});
```

**Install via your favorite package manager:**

```bash
# Conda
conda install -c conda-forge dispenso

# Fedora/RHEL
sudo dnf install dispenso-devel

# Or build from source (see below)
```

<div id='comparison'/>

## Comparison vs Other Libraries

### TBB (Intel Threading Building Blocks)

TBB has more functionality overall, but we built dispenso for three reasons:
1. **Sanitizer compatibility** — TBB doesn't work well with ASAN/TSAN
2. **Thread-safe shared futures** — TBB lacks a futures interface; dispenso provides `std::experimental::shared_future`-like futures safe for multiple concurrent waiters
3. **Non-Intel hardware** — we needed to control performance on diverse platforms

**Performance:** Dispenso tends to be faster for small and medium parallel loops, and on par for large ones. When many loops run independently, dispenso's cascading `parallel_for` avoids oversubscription and has delivered **32-50% speedups in production workloads** after porting from TBB at Meta. TBB lacks an equivalent mechanism.

See [Migrating from TBB](docs/migrating_from_tbb.md) for a step-by-step porting guide.

### OpenMP

OpenMP has simple syntax for basic loops but grows complex for advanced constructs. Nested `#pragma omp parallel for` inside threaded code risks thread explosion and machine exhaustion. Dispenso outperforms OpenMP for medium and large loops. OpenMP has an advantage for very small loops due to direct compiler support, though dispenso's `minItemsPerChunk` option can close this gap by tuning the parallelism threshold for small/fast loops.

See [Migrating from OpenMP](docs/migrating_from_openmp.md) for a step-by-step porting guide.

### Folly

Folly excels at asynchronous I/O with coroutine support. Dispenso is designed for **compute-bound** work. Dispenso's futures are lighter-weight and faster for compute workloads; Folly is the better choice for I/O-heavy applications.

### TaskFlow

TaskFlow focuses on task graph execution. Dispenso has faster graph construction, faster full and partial graph execution, much lower `parallel_for` overhead (10-100x in benchmarks), and simpler/faster pipeline construction. TaskFlow does offer CUDA graph mappings, which dispenso does not currently provide.

### Others (GCD, C++ std parallelism)

GCD is Apple-specific with ports to other platforms. C++ parallel algorithms are still evolving — we are interested in enabling dispenso as a backend for `std::execution` and C++ coroutines. Contributions and benchmarks are welcome.

<div id='migrationguides'/>

### Migration Guides

- **[Migrating from TBB](docs/migrating_from_tbb.md)** — API mappings, thread pool differences, and common porting patterns
- **[Migrating from OpenMP](docs/migrating_from_openmp.md)** — Replacing `#pragma omp` with dispenso equivalents, handling reductions and nested parallelism

<div id='nottouse'/>

## When Not to Use Dispenso
Dispenso isn't really designed for high-latency task offload; it works best for compute-bound tasks. Using the thread pool for networking, disk, or in cases with frequent TLB misses (really any scenario with kernel context switches) may result in less than ideal performance.

In these kernel context switch scenarios, `dispenso::Future` can be used with `dispenso::NewThreadInvoker`, which should perform roughly on par with `std::future`.

If you need async I/O, Folly is likely a good choice (though it still doesn't fix e.g. TLB misses).

<div id='examples'/>

## Documentation and Examples
[Documentation can be found here](https://facebookincubator.github.io/dispenso)

Here are some simple examples of what you can do in dispenso. See tests and benchmarks for more examples.

### parallel\_for

A simple sequential loop can be parallelized with minimal changes:

```cpp
for(size_t j = 0; j < kLoops; ++j) {
  vec[j] = someFunction(j);
}
```

Becomes:

```cpp
dispenso::parallel_for(0, kLoops, [&vec] (size_t j) {
  vec[j] = someFunction(j);
});
```

### TaskSet

Schedule multiple tasks and wait for them to complete:

```cpp
void randomWorkConcurrently() {
  dispenso::TaskSet tasks(dispenso::globalThreadPool());
  tasks.schedule([&stateA]() { stateA = doA(); });
  tasks.schedule([]() { doB(); });
  // Do some work on current thread
  tasks.wait(); // After this, A, B done.
  tasks.schedule(doC);
  tasks.schedule([&stateD]() { doD(stateD); });
} // TaskSet's destructor waits for all scheduled tasks to finish
```

### ConcurrentTaskSet

Build a tree in parallel using recursive task scheduling:

```cpp
struct Node {
  int val;
  std::unique_ptr<Node> left, right;
};
void buildTree(dispenso::ConcurrentTaskSet& tasks, std::unique_ptr<Node>& node, int depth) {
  if (depth) {
    node = std::make_unique<Node>();
    node->val = depth;
    tasks.schedule([&tasks, &left = node->left, depth]() { buildTree(tasks, left, depth - 1); });
    tasks.schedule([&tasks, &right = node->right, depth]() { buildTree(tasks, right, depth - 1); });
  }
}
void buildTreeParallel() {
  std::unique_ptr<Node> root;
  dispenso::ConcurrentTaskSet tasks(dispenso::globalThreadPool());
  buildTree(tasks, root, 20);
  tasks.wait();  // tasks would also wait here in destructor if we omitted this line
}
```

### Future

Compose asynchronous operations with futures:

```cpp
dispenso::Future<size_t> ThingProcessor::processThings() {
  auto expensiveFuture = dispenso::async([this]() {
    return processExpensiveThing(expensive_);
  });
  auto futureOfManyCheap = dispenso::async([this]() {
    size_t sum = 0;
    for (auto &thing : cheapThings_) {
      sum += processCheapThing(thing);
    }
    return sum;
  });
  return dispenso::when_all(expensiveFuture, futureOfManyCheap).then([](auto &&tuple) {
    return std::get<0>(tuple).get() + std::get<1>(tuple).get();
  });
}

auto result = thingProc->processThings();
useResult(result.get());
```

### ConcurrentVector

Safely grow a vector from multiple threads:

```cpp
ConcurrentVector<std::unique_ptr<int>> values;
dispenso::parallel_for(
  dispenso::makeChunkedRange(0, length, dispenso::ParForChunking::kStatic),
  [&values](int i, int end) {
    values.grow_by_generator(end - i, [i]() mutable { return std::make_unique<int>(i++); });
  });
```

<div id='benchresults'/>

## Benchmark Results

Dispenso is benchmarked across Linux (x64), macOS (ARM64), Windows (x64), and Android (ARM64),
comparing against OpenMP, TBB, TaskFlow, folly, and `std::async` across thread pools, parallel
loops, futures, graphs, concurrent containers, and more.

**[Interactive Benchmark Dashboard](https://facebookincubator.github.io/dispenso/benchmarks/)** — explore all results
with platform switching, dark/light theme, and detailed per-benchmark charts.

<div id='installing'/>

## Installing
Binary builds of Dispenso are available through several package managers:

- **Conda**: `conda install -c conda-forge dispenso`
- **Conan**: `conan install --requires=dispenso/1.5.0`
- **vcpkg**: `vcpkg install dispenso`
- **Homebrew**: `brew install dispenso`
- **MacPorts**: `sudo port install dispenso`
- **Fedora/RHEL**: `sudo dnf install dispenso-devel`

If your platform is not on the list, see [the next section](#building) for instructions to build from source.

[![Packaging status](https://repology.org/badge/vertical-allrepos/dispenso.svg)](https://repology.org/project/dispenso/versions)

<div id='building'/>

## Building

**Linux and macOS:**
```bash
mkdir build && cd build
cmake PATH_TO_DISPENSO_ROOT
make -j
```

**Windows** (from Developer Command Prompt):
```bash
mkdir build && cd build
cmake PATH_TO_DISPENSO_ROOT
cmake --build . --config Release
```

For detailed instructions including CMake prerequisites, installation, testing, and
benchmarking, see [docs/building.md](docs/building.md).

<div id='knownissues'/>

## Known Issues

* A subset of dispenso tests are known to fail on 32-bit PPC Mac.  If you have access to such a machine and are willing to help debug, it would be appreciated!

## TODO
* Enable Windows benchmarks through CMake. *(may be resolved soon — actively being worked on)*

<div id='license'/>

## License

The library is released under the MIT license, but also relies on the (excellent) moodycamel concurrentqueue library, which is released under the Simplified BSD and Zlib licenses.  See the top of the source at `dispenso/third-party/moodycamel/*.h` for details.


================================================
FILE: benchmarks/CMakeLists.txt
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.12)


# Locate Taskflow, in order of preference:
#   1. an installed package (find_package),
#   2. a source tree under DISPENSO_DEPS_DIR or ~/dispenso_deps,
#   3. a pinned release fetched from GitHub via FetchContent.
# Every path ends with an INTERFACE target named `taskflow` being available.
find_package(Taskflow QUIET)
if(NOT Taskflow_FOUND)
  # Start from an empty candidate list so that a stale value inherited from an
  # enclosing scope cannot leak into the search below.
  set(_taskflow_search_dirs "")

  # Check well-known local directories for a Taskflow source tree.
  # Searches for both "taskflow" and versioned names like "taskflow-3.11.0".
  if(DEFINED DISPENSO_DEPS_DIR)
    file(GLOB _taskflow_dep_candidates "${DISPENSO_DEPS_DIR}/taskflow*")
    list(APPEND _taskflow_search_dirs ${_taskflow_dep_candidates})
  endif()
  file(GLOB _taskflow_home_candidates "$ENV{HOME}/dispenso_deps/taskflow*")
  list(APPEND _taskflow_search_dirs ${_taskflow_home_candidates})

  # The first candidate that actually contains the umbrella header wins.
  set(_taskflow_local_dir "")
  foreach(_dir ${_taskflow_search_dirs})
    if(EXISTS "${_dir}/taskflow/taskflow.hpp")
      set(_taskflow_local_dir "${_dir}")
      break()
    endif()
  endforeach()

  if(_taskflow_local_dir)
    message(STATUS "Found local Taskflow at ${_taskflow_local_dir}")
    add_library(taskflow INTERFACE)
    target_include_directories(taskflow INTERFACE "${_taskflow_local_dir}")
  else()
    # The fetched version is pinned; keep the status message in sync with the
    # tag so the configure log reports what is actually being downloaded.
    set(_taskflow_fetch_tag v3.6.0)
    message(STATUS "Taskflow not found locally, fetching ${_taskflow_fetch_tag} from GitHub...")
    include(FetchContent)
    FetchContent_Declare(
      taskflow
      GIT_REPOSITORY https://github.com/taskflow/taskflow.git
      GIT_TAG        ${_taskflow_fetch_tag}
      CONFIGURE_COMMAND ""
      BUILD_COMMAND ""
    )
    FetchContent_GetProperties(taskflow)
    if(NOT taskflow_POPULATED)
      FetchContent_Populate(taskflow)
    endif()
    # NOTE(review): the content is already populated at this point, so this
    # call presumably performs no further work - confirm before removing.
    FetchContent_MakeAvailable(taskflow)
    # Only the headers are consumed: expose the source dir as an include path.
    add_library(taskflow INTERFACE)
    target_include_directories(taskflow INTERFACE ${taskflow_SOURCE_DIR})
  endif()
else()
  message(STATUS "Found system Taskflow")
  # Provide the plain `taskflow` name used by the link lists below.
  if(NOT TARGET taskflow)
    add_library(taskflow INTERFACE)
    target_link_libraries(taskflow INTERFACE Taskflow::Taskflow)
  endif()
endif()

# Google Benchmark: use an installed package when one exists, otherwise fetch
# a pinned release from GitHub.
find_package(benchmark QUIET)
if(benchmark_FOUND)
  message(STATUS "Found system Google Benchmark")
else()
  message(STATUS "Google Benchmark not found locally, fetching from GitHub...")
  include(FetchContent)
  # Turn off Google Benchmark's own test targets before it is configured.
  set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable benchmark tests")
  set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "Disable gtest tests")
  FetchContent_Declare(
    benchmark
    GIT_REPOSITORY https://github.com/google/benchmark.git
    GIT_TAG        v1.8.3
  )
  FetchContent_MakeAvailable(benchmark)
endif()
# OpenMP support (including Windows/MSVC)
find_package(OpenMP)

# TBB support - prefer CONFIG mode to find TBBConfig.cmake from vcpkg/modern installs,
# then fall back to the old FindTBB.cmake module.
find_package(TBB CONFIG QUIET)
if(NOT TBB_FOUND)
  find_package(TBB QUIET)
endif()

find_package(folly)

# Libraries every benchmark links against. pthread is only named explicitly
# on non-Windows platforms.
set(REQUIRED_LIBS dispenso benchmark::benchmark benchmark::benchmark_main)
if(NOT WIN32)
  list(APPEND REQUIRED_LIBS pthread)
endif()
list(APPEND REQUIRED_LIBS taskflow)

# TBB: link it when found, otherwise compile the TBB benchmarks out.
if(TBB_FOUND)
  # Handle both old TBB (lowercase 'tbb' target) and new oneTBB (TBB::tbb target).
  # The lowercase name is used only when it is the sole target available.
  if(TARGET tbb AND NOT TARGET TBB::tbb)
    list(APPEND OPTIONAL_LIBS tbb)
  else()
    list(APPEND OPTIONAL_LIBS TBB::tbb)
  endif()
else()
  add_compile_definitions(BENCHMARK_WITHOUT_TBB)
endif()

if(OpenMP_CXX_FOUND)
  list(APPEND OPTIONAL_LIBS OpenMP::OpenMP_CXX)
endif()

# Folly benchmarks are enabled only when folly was found and the user has not
# opted out through BENCHMARK_WITHOUT_FOLLY.
#
# The variable is named directly instead of being expanded with ${...}: if()
# dereferences it itself, and an unset variable is then simply false. With
# ${...} an unset variable expands to nothing and leaves a dangling NOT in the
# condition.
if (FOLLY_LIBRARIES AND NOT BENCHMARK_WITHOUT_FOLLY)
  find_package(gflags)
  set (OPTIONAL_LIBS ${OPTIONAL_LIBS} ${FOLLY_LIBRARIES})
else ()
  add_compile_definitions(BENCHMARK_WITHOUT_FOLLY)
endif ()

# Every .cpp directly under benchmarks/ becomes its own executable, named
# after the file without its extension.
file(GLOB BENCHMARK_FILES CONFIGURE_DEPENDS ${PROJECT_SOURCE_DIR}/benchmarks/*.cpp)

foreach(BENCHMARK_FILE ${BENCHMARK_FILES})
  get_filename_component(BENCHMARK_NAME ${BENCHMARK_FILE} NAME_WE)
  add_executable(${BENCHMARK_NAME} ${BENCHMARK_FILE})
  target_link_libraries(${BENCHMARK_NAME} ${REQUIRED_LIBS} ${OPTIONAL_LIBS})
  target_compile_features(${BENCHMARK_NAME} PRIVATE cxx_std_20)
endforeach()

# The fast_math benchmarks live in their own subdirectory and are opt-in.
if(DISPENSO_BUILD_FAST_MATH)
  add_subdirectory(fast_math)
endif()


================================================
FILE: benchmarks/benchmark_common.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <benchmark/benchmark.h>

// UNUSED_VAR expands to a variable name (myLocalForLoopVar) decorated with a
// compiler-specific annotation. It is meant for declarations whose variable
// is intentionally never read, e.g. `for (auto UNUSED_VAR : state)`.
#if defined(__GNUC__) || defined(__clang__)
// GCC/Clang: mark the variable with the `unused` attribute.
#define UNUSED_VAR myLocalForLoopVar __attribute__((unused))
#elif defined(_MSC_VER)
// MSVC: suppress warning 4100 for the declaration.
// NOTE(review): confirm 4100 is the warning MSVC actually emits here.
#define UNUSED_VAR myLocalForLoopVar __pragma(warning(suppress : 4100))
#else
// Unknown compiler: plain name, no annotation.
#define UNUSED_VAR myLocalForLoopVar
#endif


================================================
FILE: benchmarks/cascading_parallel_for_benchmark.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * Benchmark demonstrating dispenso's cascading parallel_for advantage.
 *
 * When multiple independent parallel_for loops need to run, dispenso can
 * overlap them on a shared TaskSet using ParForOptions{.wait = false}.
 * TBB and OpenMP each impose an implicit barrier per parallel_for call,
 * forcing sequential execution of independent loops.
 */

#include <dispenso/parallel_for.h>

#if defined(_OPENMP)
#include <omp.h>
#endif

#include <array>
#include <unordered_map>

#if !defined(BENCHMARK_WITHOUT_TBB)
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/task_group.h"
#include "tbb_compat.h"
#endif // !BENCHMARK_WITHOUT_TBB

#include "thread_benchmark_common.h"

// Element counts used as the problem-size argument of the benchmarks below.
static constexpr int32_t kSmallSize = 1000;
static constexpr int32_t kMediumSize = 100000;
static constexpr int32_t kLargeSize = 10000000;

// Number of independent input/output array pairs, i.e. the number of
// independent compute loops run before the final fusion loop.
static constexpr int32_t kNumLoops = 8;

// Minimum work per chunk to amortize scheduling overhead for cheap lambdas.
// With trivial compute (~4 integer ops ≈ 2ns/element), 512 elements ≈ 1µs
// of work, comfortably covering task dispatch cost on Windows.
static constexpr uint32_t kMinItemsPerChunk = 512;

// Seed handed to srand() before the input arrays are generated.
static uint32_t kSeed(42);

// Per-element kernel shared by every benchmark variant: x*x - 3*x + 7.
inline int32_t compute(int32_t x) {
  const int32_t squared = x * x;
  const int32_t tripled = 3 * x;
  return squared - tripled + 7;
}

inline int32_t fuse(const std::array<int32_t, kNumLoops>& values) {
  int32_t result = 0;
  for (int32_t k = 0; k < kNumLoops; ++k) {
    result += values[static_cast<size_t>(k)];
  }
  return result;
}

// Working set for one problem size: kNumLoops input arrays, the matching
// kNumLoops output arrays, and the fused result array.
struct BenchArrays {
  // inputs[k] is read by loop k.
  std::array<std::vector<int32_t>, kNumLoops> inputs;
  // outputs[k][i] receives compute(inputs[k][i]).
  std::array<std::vector<int32_t>, kNumLoops> outputs;
  // result[i] receives the fused value of outputs[0..kNumLoops-1][i].
  std::vector<int32_t> result;
};

// Returns the working set for `numElements`, building and caching it on first
// request. Inputs are pseudo-random values in [-127, 128] generated after
// reseeding with kSeed; outputs and result are zero-filled.
BenchArrays& getArrays(int32_t numElements) {
  static std::unordered_map<int32_t, BenchArrays> cache;

  const auto cached = cache.find(numElements);
  if (cached != cache.end()) {
    return cached->second;
  }

  const auto count = static_cast<size_t>(numElements);
  srand(kSeed);
  BenchArrays fresh;
  for (int32_t k = 0; k < kNumLoops; ++k) {
    auto& in = fresh.inputs[static_cast<size_t>(k)];
    in.reserve(count);
    for (int32_t i = 0; i < numElements; ++i) {
      in.push_back((rand() & 255) - 127);
    }
    fresh.outputs[static_cast<size_t>(k)].resize(count, 0);
  }
  fresh.result.resize(count, 0);

  auto inserted = cache.emplace(numElements, std::move(fresh));
  assert(inserted.second);
  return inserted.first->second;
}

// Recomputes the fused value for every element serially and aborts with a
// diagnostic on the first mismatch against ba.result.
void checkResults(BenchArrays& ba, int32_t numElements) {
  for (int32_t i = 0; i < numElements; ++i) {
    const auto idx = static_cast<size_t>(i);

    // Same summation order as fuse(): loop 0 first, loop kNumLoops-1 last.
    int32_t expectedFused = 0;
    for (int32_t k = 0; k < kNumLoops; ++k) {
      expectedFused += compute(ba.inputs[static_cast<size_t>(k)][idx]);
    }

    if (ba.result[idx] == expectedFused) {
      continue;
    }
    std::cerr << "FAIL at index " << i << ": got " << ba.result[idx] << " expected "
              << expectedFused << std::endl;
    abort();
  }
}

// Single-threaded baseline: runs the kNumLoops compute loops one after the
// other, then the fusion loop. range(0) is the element count.
void BM_serial(benchmark::State& state) {
  const int32_t numElements = state.range(0);
  BenchArrays& arrays = getArrays(numElements);

  for (auto UNUSED_VAR : state) {
    // Independent compute loops.
    for (int32_t loop = 0; loop < kNumLoops; ++loop) {
      const auto l = static_cast<size_t>(loop);
      for (int32_t i = 0; i < numElements; ++i) {
        arrays.outputs[l][static_cast<size_t>(i)] =
            compute(arrays.inputs[l][static_cast<size_t>(i)]);
      }
    }
    // Fusion loop over all outputs.
    for (int32_t i = 0; i < numElements; ++i) {
      const auto idx = static_cast<size_t>(i);
      std::array<int32_t, kNumLoops> gathered;
      for (int32_t loop = 0; loop < kNumLoops; ++loop) {
        gathered[static_cast<size_t>(loop)] = arrays.outputs[static_cast<size_t>(loop)][idx];
      }
      arrays.result[idx] = fuse(gathered);
    }
  }
  checkResults(arrays, numElements);
}

// dispenso baseline: every parallel_for (compute loops and fusion) uses the
// default options plus kMinItemsPerChunk, so each call is issued in sequence.
// range(0) is the thread count, range(1) the element count.
void BM_dispenso_blocking(benchmark::State& state) {
  // The pool gets one thread fewer than requested, presumably because the
  // calling thread also takes part in blocking parallel_for calls.
  const int32_t numThreads = state.range(0) - 1;
  const int32_t numElements = state.range(1);
  BenchArrays& arrays = getArrays(numElements);

  dispenso::ThreadPool pool(numThreads);
  dispenso::ParForOptions blocking;
  blocking.minItemsPerChunk = kMinItemsPerChunk;

  for (auto UNUSED_VAR : state) {
    dispenso::TaskSet taskSet(pool);
    for (int32_t loop = 0; loop < kNumLoops; ++loop) {
      const auto l = static_cast<size_t>(loop);
      dispenso::parallel_for(
          taskSet,
          0,
          numElements,
          [&in = arrays.inputs[l], &out = arrays.outputs[l]](int32_t i) {
            const auto idx = static_cast<size_t>(i);
            out[idx] = compute(in[idx]);
          },
          blocking);
    }
    // Fusion over all outputs.
    dispenso::parallel_for(
        taskSet,
        0,
        numElements,
        [&arrays](int32_t i) {
          const auto idx = static_cast<size_t>(i);
          std::array<int32_t, kNumLoops> gathered;
          for (int32_t loop = 0; loop < kNumLoops; ++loop) {
            gathered[static_cast<size_t>(loop)] = arrays.outputs[static_cast<size_t>(loop)][idx];
          }
          arrays.result[idx] = fuse(gathered);
        },
        blocking);
  }
  checkResults(arrays, numElements);
}

// dispenso cascading variant: the first kNumLoops-1 compute loops are issued
// with wait = false on a shared TaskSet, the last compute loop and the fusion
// loop are issued with default (blocking) options.
// range(0) is the thread count, range(1) the element count.
void BM_dispenso_cascaded(benchmark::State& state) {
  const int32_t numThreads = state.range(0) - 1;
  const int32_t numElements = state.range(1);
  BenchArrays& arrays = getArrays(numElements);

  dispenso::ThreadPool pool(numThreads);

  dispenso::ParForOptions nonBlocking;
  nonBlocking.wait = false;
  nonBlocking.minItemsPerChunk = kMinItemsPerChunk;

  dispenso::ParForOptions blocking;
  blocking.minItemsPerChunk = kMinItemsPerChunk;

  for (auto UNUSED_VAR : state) {
    dispenso::TaskSet taskSet(pool);

    // First N-1 loops: non-blocking, returns immediately
    for (int32_t loop = 0; loop < kNumLoops - 1; ++loop) {
      const auto l = static_cast<size_t>(loop);
      dispenso::parallel_for(
          taskSet,
          0,
          numElements,
          [&in = arrays.inputs[l], &out = arrays.outputs[l]](int32_t i) {
            const auto idx = static_cast<size_t>(i);
            out[idx] = compute(in[idx]);
          },
          nonBlocking);
    }

    // Last independent loop: blocking — calling thread participates,
    // implicitly waits for all prior non-blocking loops too
    {
      constexpr auto lastLoop = static_cast<size_t>(kNumLoops - 1);
      dispenso::parallel_for(
          taskSet,
          0,
          numElements,
          [&in = arrays.inputs[lastLoop], &out = arrays.outputs[lastLoop]](int32_t i) {
            const auto idx = static_cast<size_t>(i);
            out[idx] = compute(in[idx]);
          },
          blocking);
    }

    // Fusion: blocking (depends on all outputs being complete)
    dispenso::parallel_for(
        taskSet,
        0,
        numElements,
        [&arrays](int32_t i) {
          const auto idx = static_cast<size_t>(i);
          std::array<int32_t, kNumLoops> gathered;
          for (int32_t loop = 0; loop < kNumLoops; ++loop) {
            gathered[static_cast<size_t>(loop)] = arrays.outputs[static_cast<size_t>(loop)][idx];
          }
          arrays.result[idx] = fuse(gathered);
        },
        blocking);
  }
  checkResults(arrays, numElements);
}

#if defined(_OPENMP)
// OpenMP variant: one `parallel for` region per compute loop, followed by one
// for the fusion loop. range(0) is the thread count, range(1) the element count.
void BM_omp(benchmark::State& state) {
  const int32_t numThreads = state.range(0);
  const int32_t numElements = state.range(1);
  BenchArrays& arrays = getArrays(numElements);

  omp_set_num_threads(numThreads);

  for (auto UNUSED_VAR : state) {
    for (int32_t loop = 0; loop < kNumLoops; ++loop) {
      const auto l = static_cast<size_t>(loop);
#pragma omp parallel for
      for (int32_t i = 0; i < numElements; ++i) {
        arrays.outputs[l][static_cast<size_t>(i)] =
            compute(arrays.inputs[l][static_cast<size_t>(i)]);
      }
    }
#pragma omp parallel for
    for (int32_t i = 0; i < numElements; ++i) {
      const auto idx = static_cast<size_t>(i);
      std::array<int32_t, kNumLoops> gathered;
      for (int32_t loop = 0; loop < kNumLoops; ++loop) {
        gathered[static_cast<size_t>(loop)] = arrays.outputs[static_cast<size_t>(loop)][idx];
      }
      arrays.result[idx] = fuse(gathered);
    }
  }
  checkResults(arrays, numElements);
}
#endif /*defined(_OPENMP)*/

#if !defined(BENCHMARK_WITHOUT_TBB)
// TBB variant: one tbb::parallel_for per compute loop, then one for the
// fusion loop. range(0) is the thread count, range(1) the element count.
void BM_tbb(benchmark::State& state) {
  const int32_t numThreads = state.range(0);
  const int32_t numElements = state.range(1);
  BenchArrays& arrays = getArrays(numElements);

  for (auto UNUSED_VAR : state) {
    // The scheduler object is constructed inside the timed loop.
    tbb_compat::task_scheduler_init scheduler(numThreads);

    for (int32_t loop = 0; loop < kNumLoops; ++loop) {
      const auto l = static_cast<size_t>(loop);
      tbb::parallel_for(
          tbb::blocked_range<int32_t>(0, numElements),
          [&in = arrays.inputs[l],
           &out = arrays.outputs[l]](const tbb::blocked_range<int32_t>& range) {
            for (int32_t i = range.begin(); i != range.end(); ++i) {
              const auto idx = static_cast<size_t>(i);
              out[idx] = compute(in[idx]);
            }
          });
    }
    tbb::parallel_for(
        tbb::blocked_range<int32_t>(0, numElements),
        [&arrays](const tbb::blocked_range<int32_t>& range) {
          for (int32_t i = range.begin(); i != range.end(); ++i) {
            const auto idx = static_cast<size_t>(i);
            std::array<int32_t, kNumLoops> gathered;
            for (int32_t loop = 0; loop < kNumLoops; ++loop) {
              gathered[static_cast<size_t>(loop)] =
                  arrays.outputs[static_cast<size_t>(loop)][idx];
            }
            arrays.result[idx] = fuse(gathered);
          }
        });
  }
  checkResults(arrays, numElements);
}

// TBB with task_group: each independent compute loop is launched as its own
// task_group task (which itself runs a tbb::parallel_for), the group is
// waited on, and the fusion loop runs afterwards — emulating dispenso's
// cascading behavior. range(0) is the thread count, range(1) the element count.
void BM_tbb_task_group(benchmark::State& state) {
  const int32_t numThreads = state.range(0);
  const int32_t numElements = state.range(1);
  BenchArrays& arrays = getArrays(numElements);

  for (auto UNUSED_VAR : state) {
    tbb_compat::task_scheduler_init scheduler(numThreads);
    tbb::task_group group;

    for (int32_t loop = 0; loop < kNumLoops; ++loop) {
      const auto l = static_cast<size_t>(loop);
      group.run([&in = arrays.inputs[l], &out = arrays.outputs[l], numElements]() {
        tbb::parallel_for(
            tbb::blocked_range<int32_t>(0, numElements),
            [&in, &out](const tbb::blocked_range<int32_t>& range) {
              for (int32_t i = range.begin(); i != range.end(); ++i) {
                const auto idx = static_cast<size_t>(i);
                out[idx] = compute(in[idx]);
              }
            });
      });
    }
    // All compute loops must be finished before fusion reads the outputs.
    group.wait();

    tbb::parallel_for(
        tbb::blocked_range<int32_t>(0, numElements),
        [&arrays](const tbb::blocked_range<int32_t>& range) {
          for (int32_t i = range.begin(); i != range.end(); ++i) {
            const auto idx = static_cast<size_t>(i);
            std::array<int32_t, kNumLoops> gathered;
            for (int32_t loop = 0; loop < kNumLoops; ++loop) {
              gathered[static_cast<size_t>(loop)] =
                  arrays.outputs[static_cast<size_t>(loop)][idx];
            }
            arrays.result[idx] = fuse(gathered);
          }
        });
  }
  checkResults(arrays, numElements);
}
#endif // !BENCHMARK_WITHOUT_TBB

// Registers one {threads, elements} argument pair for every combination of
// problem size and thread count returned by pow2HalfStepThreads().
static void CustomArguments(benchmark::internal::Benchmark* b) {
  for (int numElements : {kSmallSize, kMediumSize, kLargeSize}) {
    for (int numThreads : pow2HalfStepThreads()) {
      b->Args({numThreads, numElements});
    }
  }
}

// Serial baseline: a single argument, the element count.
BENCHMARK(BM_serial)->Args({kSmallSize})->Args({kMediumSize})->Args({kLargeSize})->UseRealTime();

// Threaded variants take {threads, elements} pairs from CustomArguments.
#if defined(_OPENMP)
BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime();
#endif // OPENMP
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime();
BENCHMARK(BM_tbb_task_group)->Apply(CustomArguments)->UseRealTime();
#endif // !BENCHMARK_WITHOUT_TBB

BENCHMARK(BM_dispenso_blocking)->Apply(CustomArguments)->UseRealTime();
BENCHMARK(BM_dispenso_cascaded)->Apply(CustomArguments)->UseRealTime();

BENCHMARK_MAIN();


================================================
FILE: benchmarks/concurrent_vector_benchmark.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <deque>
#include <iostream>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>

#if !defined(BENCHMARK_WITHOUT_TBB)
#include "tbb/concurrent_vector.h"
#endif // !BENCHMARK_WITHOUT_TBB

#include <dispenso/concurrent_vector.h>
#include <dispenso/parallel_for.h>

#include "thread_benchmark_common.h"

constexpr size_t kLength = (1 << 20);

// Verifies that `sum` equals 0 + 1 + ... + (kLength - 1); prints both values
// and aborts otherwise.
void checkIotaSum(int64_t sum) {
  if (sum == (static_cast<int64_t>(kLength - 1) * kLength) / 2) {
    return;
  }

  std::cout << sum << " vs " << ((kLength - 1) * kLength) / 2 << std::endl;

  std::abort();
}

// Like checkIotaSum(int64_t), but on a mismatch also reports every index in
// [0, kLength) that does not appear as a value in `c` before aborting.
template <typename Cont>
void checkIotaSum(const Cont& c, int64_t sum) {
  if (sum == (static_cast<int64_t>(kLength - 1) * kLength) / 2) {
    return;
  }

  std::cout << sum << " vs " << ((kLength - 1) * kLength) / 2 << std::endl;

  // Mark each value that is present in the container.
  std::vector<uint8_t> seen(kLength);
  for (auto value : c) {
    seen[value] = 1;
  }

  for (size_t candidate = 0; candidate < kLength; ++candidate) {
    if (seen[candidate]) {
      continue;
    }
    std::cout << "missing " << candidate << std::endl;
  }

  std::abort();
}

// Times building a container from scratch: each iteration obtains a fresh
// container from `containerInit` and push_backs the values 0..kLength-1.
template <typename ContainerInit>
void pushBackImpl(benchmark::State& state, ContainerInit containerInit) {
  for (auto UNUSED_VAR : state) {
    auto container = containerInit();
    for (size_t next = 0; next < kLength; ++next) {
      container.push_back(next);
    }
  }
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// TBB alternative to repeated push_back: grow the container by kLength
// elements in one call, then assign 0..kLength-1 through the returned iterator.
template <typename ContainerInit>
void pushBackGrowByAlternativeTbb(benchmark::State& state, ContainerInit containerInit) {
  for (auto UNUSED_VAR : state) {
    auto container = containerInit();
    auto cursor = container.grow_by(kLength);
    auto stop = container.end();
    for (size_t next = 0; cursor != stop; ++cursor, ++next) {
      *cursor = next;
    }
  }
}
#endif // !BENCHMARK_WITHOUT_TBB

// dispenso alternative to repeated push_back: a single grow_by_generator call
// appends kLength elements, with the generator yielding 0, 1, 2, ...
template <typename ContainerInit>
void pushBackGrowByAlternativeDispenso(benchmark::State& state, ContainerInit containerInit) {
  for (auto UNUSED_VAR : state) {
    auto container = containerInit();
    container.grow_by_generator(kLength, [next = size_t{0}]() mutable {
      const size_t current = next;
      ++next;
      return current;
    });
  }
}

// --- push_back into a default-constructed container ---

void BM_std_push_back_serial(benchmark::State& state) {
  pushBackImpl(state, [] { return std::vector<int>(); });
}

void BM_deque_push_back_serial(benchmark::State& state) {
  pushBackImpl(state, [] { return std::deque<int>(); });
}

#if !defined(BENCHMARK_WITHOUT_TBB)
void BM_tbb_push_back_serial(benchmark::State& state) {
  pushBackImpl(state, [] { return tbb::concurrent_vector<int>(); });
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_push_back_serial(benchmark::State& state) {
  pushBackImpl(state, [] { return dispenso::ConcurrentVector<int>(); });
}

// --- bulk-growth alternative on a default-constructed container ---

#if !defined(BENCHMARK_WITHOUT_TBB)
void BM_tbb_push_back_serial_grow_by_alternative(benchmark::State& state) {
  pushBackGrowByAlternativeTbb(state, [] { return tbb::concurrent_vector<int>(); });
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_push_back_serial_grow_by_alternative(benchmark::State& state) {
  pushBackGrowByAlternativeDispenso(state, [] { return dispenso::ConcurrentVector<int>(); });
}

// --- push_back into a container with kLength elements reserved up front ---

void BM_std_push_back_serial_reserve(benchmark::State& state) {
  pushBackImpl(state, [] {
    std::vector<int> reserved;
    reserved.reserve(kLength);
    return reserved;
  });
}

#if !defined(BENCHMARK_WITHOUT_TBB)
void BM_tbb_push_back_serial_reserve(benchmark::State& state) {
  pushBackImpl(state, [] {
    tbb::concurrent_vector<int> reserved;
    reserved.reserve(kLength);
    return reserved;
  });
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_push_back_serial_reserve(benchmark::State& state) {
  pushBackImpl(
      state, [] { return dispenso::ConcurrentVector<int>(kLength, dispenso::ReserveTag); });
}

// --- bulk-growth alternative with kLength elements reserved up front ---

#if !defined(BENCHMARK_WITHOUT_TBB)
void BM_tbb_push_back_serial_grow_by_alternative_reserve(benchmark::State& state) {
  pushBackGrowByAlternativeTbb(state, [] {
    tbb::concurrent_vector<int> reserved;
    reserved.reserve(kLength);
    return reserved;
  });
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_push_back_serial_grow_by_alternative_reserve(benchmark::State& state) {
  pushBackGrowByAlternativeDispenso(
      state, [] { return dispenso::ConcurrentVector<int>(kLength, dispenso::ReserveTag); });
}

// Times a forward range-for traversal of a container holding 0..kLength-1.
//
// The container is filled once, outside the timed loop. Each timed iteration
// sums all elements; after the loop the last sum is validated with
// checkIotaSum, which aborts on a mismatch.
template <typename ContainerInit>
void iterateImpl(benchmark::State& state, ContainerInit containerInit) {
  auto values = containerInit();
  for (size_t i = 0; i < kLength; ++i) {
    values.push_back(i);
  }
  // Initialized so that the check below never reads an indeterminate value,
  // even if the timed loop body does not execute.
  int64_t sum = 0;
  for (auto UNUSED_VAR : state) {
    sum = 0;
    for (auto i : values) {
      sum += i;
    }
  }

  checkIotaSum(sum);
}

// Forward-iteration benchmark, one entry point per container type.

void BM_std_iterate(benchmark::State& state) {
  iterateImpl(state, [] { return std::vector<int>(); });
}

void BM_deque_iterate(benchmark::State& state) {
  iterateImpl(state, [] { return std::deque<int>(); });
}

#if !defined(BENCHMARK_WITHOUT_TBB)
void BM_tbb_iterate(benchmark::State& state) {
  iterateImpl(state, [] { return tbb::concurrent_vector<int>(); });
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_iterate(benchmark::State& state) {
  iterateImpl(state, [] { return dispenso::ConcurrentVector<int>(); });
}

// Adapter that lets a range-based for loop walk a container back to front:
// `for (auto v : reverse(container))`.
//
// Holds only a reference to the wrapped iterable, never a copy.
template <typename T>
struct ReverseWrapper {
  T& iterable;
};

// begin() of the adapter is the wrapped iterable's reverse-begin.
template <typename T>
auto begin(ReverseWrapper<T> w) {
  return std::rbegin(w.iterable);
}

// end() of the adapter is the wrapped iterable's reverse-end.
template <typename T>
auto end(ReverseWrapper<T> w) {
  return std::rend(w.iterable);
}

// Wraps `iterable` for reverse traversal.
// NOTE: only a reference is stored, so the argument must outlive the returned
// wrapper; passing a temporary would leave the wrapper referring to a
// destroyed object.
template <typename T>
ReverseWrapper<T> reverse(T&& iterable) {
  return {iterable};
}

// Times a reverse range-for traversal (via reverse()) of a container holding
// 0..kLength-1.
//
// The container is filled once, outside the timed loop. Each timed iteration
// sums all elements back to front; after the loop the last sum is validated
// with checkIotaSum, which aborts on a mismatch.
template <typename ContainerInit>
void iterateReverseImpl(benchmark::State& state, ContainerInit containerInit) {
  auto values = containerInit();
  for (size_t i = 0; i < kLength; ++i) {
    values.push_back(i);
  }
  // Initialized so that the check below never reads an indeterminate value,
  // even if the timed loop body does not execute.
  int64_t sum = 0;
  for (auto UNUSED_VAR : state) {
    sum = 0;
    for (auto i : reverse(values)) {
      sum += i;
    }
  }

  checkIotaSum(sum);
}

// Reverse-iteration benchmark, one entry point per container type.

void BM_std_iterate_reverse(benchmark::State& state) {
  iterateReverseImpl(state, [] { return std::vector<int>(); });
}

void BM_deque_iterate_reverse(benchmark::State& state) {
  iterateReverseImpl(state, [] { return std::deque<int>(); });
}

#if !defined(BENCHMARK_WITHOUT_TBB)
void BM_tbb_iterate_reverse(benchmark::State& state) {
  iterateReverseImpl(state, [] { return tbb::concurrent_vector<int>(); });
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_iterate_reverse(benchmark::State& state) {
  iterateReverseImpl(state, [] { return dispenso::ConcurrentVector<int>(); });
}

// Times std::lower_bound over a container holding the sorted values
// 0..kLength-1.
//
// The container is filled once, outside the timed loop. Each timed iteration
// searches for every value in [0, kLength) and accumulates the position at
// which it was found. Since value i sits at index i, the positions add up to
// the same total that checkIotaSum validates after the loop.
template <typename ContainerInit>
void lowerBoundImpl(benchmark::State& state, ContainerInit containerInit) {
  auto values = containerInit();
  for (size_t i = 0; i < kLength; ++i) {
    values.push_back(i);
  }
  // Initialized so that the check below never reads an indeterminate value,
  // even if the timed loop body does not execute.
  int64_t sum = 0;
  for (auto UNUSED_VAR : state) {
    sum = 0;
    for (size_t i = 0; i < kLength; ++i) {
      sum += std::lower_bound(std::begin(values), std::end(values), i) - std::begin(values);
    }
  }

  checkIotaSum(sum);
}

// Binary-search benchmarks. Each variant passes lowerBoundImpl a factory that yields an
// empty container of the type being measured.
void BM_std_lower_bound(benchmark::State& state) {
  auto makeVector = []() { return std::vector<int>(); };
  lowerBoundImpl(state, makeVector);
}

void BM_deque_lower_bound(benchmark::State& state) {
  auto makeDeque = []() { return std::deque<int>(); };
  lowerBoundImpl(state, makeDeque);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_lower_bound(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  lowerBoundImpl(state, makeTbbVector);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_lower_bound(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  lowerBoundImpl(state, makeConcurrentVector);
}

// Benchmark body: sums a container holding 0..kLength-1 through operator[] in
// ascending index order.
//
// containerInit is a nullary factory returning the empty container under test. The fill
// happens once, before the timed loop. size() is read once per timed iteration and the
// indexed traversal runs against that cached length. The sum from the last timed
// iteration is passed to checkIotaSum.
template <typename ContainerInit>
void indexImpl(benchmark::State& state, ContainerInit containerInit) {
  auto values = containerInit();
  for (size_t i = 0; i < kLength; ++i) {
    values.push_back(i);
  }
  // Give sum a defined value up front: if the timed loop body never executes, an
  // uninitialized sum would be read by checkIotaSum below.
  int64_t sum = 0;
  for (auto UNUSED_VAR : state) {
    sum = 0;
    size_t len = values.size();
    for (size_t i = 0; i < len; ++i) {
      sum += values[i];
    }
  }

  checkIotaSum(sum);
}

// Sequential operator[] benchmarks. Each variant passes indexImpl a factory that yields
// an empty container of the type being measured.
void BM_std_index(benchmark::State& state) {
  auto makeVector = []() { return std::vector<int>(); };
  indexImpl(state, makeVector);
}

void BM_deque_index(benchmark::State& state) {
  auto makeDeque = []() { return std::deque<int>(); };
  indexImpl(state, makeDeque);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_index(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  indexImpl(state, makeTbbVector);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_index(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  indexImpl(state, makeConcurrentVector);
}

// Benchmark body: sums a container holding 0..kLength-1 through operator[] in a
// shuffled index order.
//
// containerInit is a nullary factory returning the empty container under test. The
// container and the index list are filled before the timed loop, and the index list is
// shuffled once with a fixed-seed std::mt19937 so every run visits the same order. The
// sum from the last timed iteration is passed to checkIotaSum.
template <typename ContainerInit>
void randomImpl(benchmark::State& state, ContainerInit containerInit) {
  auto values = containerInit();
  std::vector<size_t> indices;
  for (size_t i = 0; i < kLength; ++i) {
    values.push_back(i);
    indices.push_back(i);
  }

  // Make this repeatable.
  std::mt19937 rng(27);
  std::shuffle(indices.begin(), indices.end(), rng);

  // Give sum a defined value up front: if the timed loop body never executes, an
  // uninitialized sum would be read by checkIotaSum below.
  int64_t sum = 0;
  for (auto UNUSED_VAR : state) {
    sum = 0;
    for (auto i : indices) {
      sum += values[i];
    }
  }

  checkIotaSum(sum);
}

// Shuffled-order operator[] benchmarks. Each variant passes randomImpl a factory that
// yields an empty container of the type being measured.
void BM_std_random(benchmark::State& state) {
  auto makeVector = []() { return std::vector<int>(); };
  randomImpl(state, makeVector);
}

void BM_deque_random(benchmark::State& state) {
  auto makeDeque = []() { return std::deque<int>(); };
  randomImpl(state, makeDeque);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_random(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  randomImpl(state, makeTbbVector);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_random(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  randomImpl(state, makeConcurrentVector);
}

// Benchmark body: builds a fresh container on every timed iteration and appends the
// indices [0, kLength) to it from inside dispenso::parallel_for.
//
// containerInit: nullary factory for the container under test (runs inside the timed
//                loop, so construction and destruction are part of the measurement).
// containerPush: callable invoked as containerPush(container, index); it is
//                responsible for whatever synchronization the container needs.
template <typename ContainerInit, typename ContainerPush>
void parallelImpl(
    benchmark::State& state,
    ContainerInit containerInit,
    ContainerPush containerPush) {
  for (auto UNUSED_VAR : state) {
    auto container = containerInit();
    dispenso::parallel_for(0, kLength, [&container, containerPush](size_t index) {
      containerPush(container, index);
    });
  }
}

// Parallel push_back benchmarks on containers that start empty and unreserved. The
// std::vector and std::deque variants serialize every push behind a mutex; the TBB and
// dispenso variants call push_back directly.
void BM_std_parallel(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeVector = []() { return std::vector<int>(); };
  auto lockedPush = [&pushMutex](std::vector<int>& c, int i) {
    std::lock_guard<std::mutex> guard(pushMutex);
    c.push_back(i);
  };
  parallelImpl(state, makeVector, lockedPush);
}

void BM_deque_parallel(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeDeque = []() { return std::deque<int>(); };
  auto lockedPush = [&pushMutex](std::deque<int>& c, int i) {
    std::lock_guard<std::mutex> guard(pushMutex);
    c.push_back(i);
  };
  parallelImpl(state, makeDeque, lockedPush);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_parallel(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  auto push = [](tbb::concurrent_vector<int>& c, int i) { c.push_back(i); };
  parallelImpl(state, makeTbbVector, push);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_parallel(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  auto push = [](dispenso::ConcurrentVector<int>& c, int i) { c.push_back(i); };
  parallelImpl(state, makeConcurrentVector, push);
}

// Parallel push_back benchmarks on containers whose capacity for kLength elements is
// requested up front, inside the factory.
void BM_std_parallel_reserve(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeReservedVector = []() {
    std::vector<int> reserved;
    reserved.reserve(kLength);
    return reserved;
  };
  auto lockedPush = [&pushMutex](std::vector<int>& c, int i) {
    std::lock_guard<std::mutex> guard(pushMutex);
    c.push_back(i);
  };
  parallelImpl(state, makeReservedVector, lockedPush);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_parallel_reserve(benchmark::State& state) {
  auto makeReservedTbbVector = []() {
    tbb::concurrent_vector<int> reserved;
    reserved.reserve(kLength);
    return reserved;
  };
  auto push = [](tbb::concurrent_vector<int>& c, int i) { c.push_back(i); };
  parallelImpl(state, makeReservedTbbVector, push);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_parallel_reserve(benchmark::State& state) {
  // dispenso takes the reservation through a tagged constructor instead of reserve().
  auto makeReservedConcurrentVector = []() {
    return dispenso::ConcurrentVector<int>(kLength, dispenso::ReserveTag);
  };
  auto push = [](dispenso::ConcurrentVector<int>& c, int i) { c.push_back(i); };
  parallelImpl(state, makeReservedConcurrentVector, push);
}

// Benchmark body: reuses one container across timed iterations, clearing it and then
// appending the indices [0, kLength) from inside dispenso::parallel_for.
//
// containerInit: nullary factory for the container under test (runs once, untimed).
// containerPush: callable invoked as containerPush(container, index).
//
// After timing, the elements left by the final iteration are summed and the total is
// passed to checkIotaSum.
template <typename ContainerInit, typename ContainerPush>
void parallelImplClear(
    benchmark::State& state,
    ContainerInit containerInit,
    ContainerPush containerPush) {
  auto container = containerInit();

  for (auto UNUSED_VAR : state) {
    container.clear();
    dispenso::parallel_for(0, kLength, [&container, containerPush](size_t index) {
      containerPush(container, index);
    });
  }

  int64_t total = 0;
  for (auto element : container) {
    total += element;
  }

  checkIotaSum(total);
}

// Clear-then-refill parallel push_back benchmarks. The std::vector and std::deque
// variants serialize every push behind a mutex; the TBB and dispenso variants call
// push_back directly.
void BM_std_parallel_clear(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeVector = []() { return std::vector<int>(); };
  auto lockedPush = [&pushMutex](std::vector<int>& c, int i) {
    std::lock_guard<std::mutex> guard(pushMutex);
    c.push_back(i);
  };
  parallelImplClear(state, makeVector, lockedPush);
}

void BM_deque_parallel_clear(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeDeque = []() { return std::deque<int>(); };
  auto lockedPush = [&pushMutex](std::deque<int>& c, int i) {
    std::lock_guard<std::mutex> guard(pushMutex);
    c.push_back(i);
  };
  parallelImplClear(state, makeDeque, lockedPush);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_parallel_clear(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  auto push = [](tbb::concurrent_vector<int>& c, int i) { c.push_back(i); };
  parallelImplClear(state, makeTbbVector, push);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_parallel_clear(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  auto push = [](dispenso::ConcurrentVector<int>& c, int i) { c.push_back(i); };
  parallelImplClear(state, makeConcurrentVector, push);
}

// Benchmark body: reuses one container across timed iterations, clearing it and then
// appending [0, kLength) in batches of `growBy` from statically chunked parallel work.
//
// growBy:        batch size handed to containerPush.
// containerInit: nullary factory for the container under test (runs once, untimed).
// containerPush: callable invoked as containerPush(container, first, last) to append
//                the half-open range [first, last).
//
// After timing, the elements left by the final iteration are summed and passed,
// together with the container, to checkIotaSum.
template <typename ContainerInit, typename ContainerPush>
void parallelImplGrowBy(
    size_t growBy,
    benchmark::State& state,
    ContainerInit containerInit,
    ContainerPush containerPush) {
  auto values = containerInit();

  for (auto UNUSED_VAR : state) {
    values.clear();
    dispenso::parallel_for(
        dispenso::makeChunkedRange(0, kLength, dispenso::ParForChunking::kStatic),
        [&values, containerPush, growBy](size_t first, size_t last) {
          size_t cursor = first;
          // Full-size batches.
          for (; cursor + growBy <= last; cursor += growBy) {
            containerPush(values, cursor, cursor + growBy);
          }
          // Whatever is left over; this range is empty when the chunk length is an
          // exact multiple of growBy, and containerPush is still called in that case.
          containerPush(values, cursor, last);
        });
  }

  int64_t total = 0;
  for (auto element : values) {
    total += element;
  }

  checkIotaSum(values, total);
}

// Batched parallel append benchmarks with a batch size of 10. The std::vector and
// std::deque variants take the mutex once per batch; the TBB variant grows by the batch
// length and fills through the returned iterator; the dispenso variant uses
// grow_by_generator.
void BM_std_parallel_grow_by_10(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeVector = []() { return std::vector<int>(); };
  auto lockedAppend = [&pushMutex](std::vector<int>& c, int i, int end) {
    std::lock_guard<std::mutex> guard(pushMutex);
    for (int value = i; value != end; ++value) {
      c.push_back(value);
    }
  };
  parallelImplGrowBy(10, state, makeVector, lockedAppend);
}

void BM_deque_parallel_grow_by_10(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeDeque = []() { return std::deque<int>(); };
  auto lockedAppend = [&pushMutex](std::deque<int>& c, int i, int end) {
    std::lock_guard<std::mutex> guard(pushMutex);
    for (int value = i; value != end; ++value) {
      c.push_back(value);
    }
  };
  parallelImplGrowBy(10, state, makeDeque, lockedAppend);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_parallel_grow_by_10(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  auto append = [](tbb::concurrent_vector<int>& c, int i, int end) {
    auto slot = c.grow_by(end - i);
    while (i != end) {
      *slot = i;
      ++i;
      ++slot;
    }
  };
  parallelImplGrowBy(10, state, makeTbbVector, append);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_parallel_grow_by_10(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  auto append = [](dispenso::ConcurrentVector<int>& c, int i, int end) {
    // The generator hands out i, i + 1, ... on successive calls.
    c.grow_by_generator(end - i, [next = i]() mutable { return next++; });
  };
  parallelImplGrowBy(10, state, makeConcurrentVector, append);
}

// Batched parallel append benchmarks with a batch size of 100. The std::vector and
// std::deque variants take the mutex once per batch; the TBB variant grows by the batch
// length and fills through the returned iterator; the dispenso variant uses
// grow_by_generator.
void BM_std_parallel_grow_by_100(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeVector = []() { return std::vector<int>(); };
  auto lockedAppend = [&pushMutex](std::vector<int>& c, int i, int end) {
    std::lock_guard<std::mutex> guard(pushMutex);
    for (int value = i; value != end; ++value) {
      c.push_back(value);
    }
  };
  parallelImplGrowBy(100, state, makeVector, lockedAppend);
}

void BM_deque_parallel_grow_by_100(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeDeque = []() { return std::deque<int>(); };
  auto lockedAppend = [&pushMutex](std::deque<int>& c, int i, int end) {
    std::lock_guard<std::mutex> guard(pushMutex);
    for (int value = i; value != end; ++value) {
      c.push_back(value);
    }
  };
  parallelImplGrowBy(100, state, makeDeque, lockedAppend);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_parallel_grow_by_100(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  auto append = [](tbb::concurrent_vector<int>& c, int i, int end) {
    auto slot = c.grow_by(end - i);
    while (i != end) {
      *slot = i;
      ++i;
      ++slot;
    }
  };
  parallelImplGrowBy(100, state, makeTbbVector, append);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_parallel_grow_by_100(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  auto append = [](dispenso::ConcurrentVector<int>& c, int i, int end) {
    // The generator hands out i, i + 1, ... on successive calls.
    c.grow_by_generator(end - i, [next = i]() mutable { return next++; });
  };
  parallelImplGrowBy(100, state, makeConcurrentVector, append);
}

// Benchmark body: reuses one container across timed iterations, clearing it and then
// appending [0, kLength) with exactly one containerPush call per statically chunked
// sub-range.
//
// containerInit: nullary factory for the container under test (runs once, untimed).
// containerPush: callable invoked as containerPush(container, first, last) to append
//                the half-open range [first, last).
//
// After timing, the elements left by the final iteration are summed and the total is
// passed to checkIotaSum.
template <typename ContainerInit, typename ContainerPush>
void parallelImplGrowByMax(
    benchmark::State& state,
    ContainerInit containerInit,
    ContainerPush containerPush) {
  auto container = containerInit();

  for (auto UNUSED_VAR : state) {
    container.clear();
    dispenso::parallel_for(
        dispenso::makeChunkedRange(0, kLength, dispenso::ParForChunking::kStatic),
        [&container, containerPush](size_t first, size_t last) {
          containerPush(container, first, last);
        });
  }

  int64_t total = 0;
  for (auto element : container) {
    total += element;
  }

  checkIotaSum(total);
}

// Batched parallel append benchmarks where each chunk is appended in a single call. The
// std::vector and std::deque variants take the mutex once per chunk; the TBB variant
// grows by the chunk length and fills through the returned iterator; the dispenso
// variant uses grow_by_generator.
void BM_std_parallel_grow_by_max(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeVector = []() { return std::vector<int>(); };
  auto lockedAppend = [&pushMutex](std::vector<int>& c, int i, int end) {
    std::lock_guard<std::mutex> guard(pushMutex);
    for (int value = i; value != end; ++value) {
      c.push_back(value);
    }
  };
  parallelImplGrowByMax(state, makeVector, lockedAppend);
}

void BM_deque_parallel_grow_by_max(benchmark::State& state) {
  std::mutex pushMutex;
  auto makeDeque = []() { return std::deque<int>(); };
  auto lockedAppend = [&pushMutex](std::deque<int>& c, int i, int end) {
    std::lock_guard<std::mutex> guard(pushMutex);
    for (int value = i; value != end; ++value) {
      c.push_back(value);
    }
  };
  parallelImplGrowByMax(state, makeDeque, lockedAppend);
}

#if !defined(BENCHMARK_WITHOUT_TBB)
// Compiled out when BENCHMARK_WITHOUT_TBB is defined.
void BM_tbb_parallel_grow_by_max(benchmark::State& state) {
  auto makeTbbVector = []() { return tbb::concurrent_vector<int>(); };
  auto append = [](tbb::concurrent_vector<int>& c, int i, int end) {
    auto slot = c.grow_by(end - i);
    while (i != end) {
      *slot = i;
      ++i;
      ++slot;
    }
  };
  parallelImplGrowByMax(state, makeTbbVector, append);
}
#endif // !BENCHMARK_WITHOUT_TBB

void BM_dispenso_parallel_grow_by_max(benchmark::State& state) {
  auto makeConcurrentVector = []() { return dispenso::ConcurrentVector<int>(); };
  auto append = [](dispenso::ConcurrentVector<int>& c, int i, int end) {
    // The generator hands out i, i + 1, ... on successive calls.
    c.grow_by_generator(end - i, [next = i]() mutable { return next++; });
  };
  parallelImplGrowByMax(state, makeConcurrentVector, append);
}

// Benchmark registrations, grouped by scenario. Within each group the order is
// std::vector, std::deque, TBB (only when BENCHMARK_WITHOUT_TBB is not defined), then
// dispenso; groups without a std or deque entry have no such variant registered.

// Serial push_back.
BENCHMARK(BM_std_push_back_serial);
BENCHMARK(BM_deque_push_back_serial);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_push_back_serial);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_push_back_serial);

// Serial fill using the grow_by alternative.
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_push_back_serial_grow_by_alternative);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_push_back_serial_grow_by_alternative);

// Serial push_back with reserved capacity.
BENCHMARK(BM_std_push_back_serial_reserve);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_push_back_serial_reserve);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_push_back_serial_reserve);

// Serial grow_by alternative with reserved capacity.
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_push_back_serial_grow_by_alternative_reserve);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_push_back_serial_grow_by_alternative_reserve);

// Forward iteration.
BENCHMARK(BM_std_iterate);
BENCHMARK(BM_deque_iterate);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_iterate);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_iterate);

// Reverse iteration.
BENCHMARK(BM_std_iterate_reverse);
BENCHMARK(BM_deque_iterate_reverse);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_iterate_reverse);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_iterate_reverse);

// std::lower_bound over the container's iterators.
BENCHMARK(BM_std_lower_bound);
BENCHMARK(BM_deque_lower_bound);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_lower_bound);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_lower_bound);

// Sequential operator[] access.
BENCHMARK(BM_std_index);
BENCHMARK(BM_deque_index);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_index);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_index);

// Shuffled-order operator[] access.
BENCHMARK(BM_std_random);
BENCHMARK(BM_deque_random);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_random);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_random);

// Parallel push_back into a fresh container per iteration.
BENCHMARK(BM_std_parallel);
BENCHMARK(BM_deque_parallel);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_parallel);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_parallel);

// Parallel push_back into a fresh, pre-reserved container per iteration.
BENCHMARK(BM_std_parallel_reserve);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_parallel_reserve);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_parallel_reserve);

// Parallel push_back into a container that is cleared and reused.
BENCHMARK(BM_std_parallel_clear);
BENCHMARK(BM_deque_parallel_clear);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_parallel_clear);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_parallel_clear);

// Parallel batched append, batch size 10.
BENCHMARK(BM_std_parallel_grow_by_10);
BENCHMARK(BM_deque_parallel_grow_by_10);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_parallel_grow_by_10);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_parallel_grow_by_10);

// Parallel batched append, batch size 100.
BENCHMARK(BM_std_parallel_grow_by_100);
BENCHMARK(BM_deque_parallel_grow_by_100);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_parallel_grow_by_100);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_parallel_grow_by_100);

// Parallel batched append, one batch per chunk.
BENCHMARK(BM_std_parallel_grow_by_max);
BENCHMARK(BM_deque_parallel_grow_by_max);
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb_parallel_grow_by_max);
#endif // !BENCHMARK_WITHOUT_TBB
BENCHMARK(BM_dispenso_parallel_grow_by_max);

// Google Benchmark supplies main() for this executable.
BENCHMARK_MAIN();


================================================
FILE: benchmarks/fast_math/CMakeLists.txt
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# file(GLOB ... CONFIGURE_DEPENDS) below needs CMake 3.12 or newer.
cmake_minimum_required(VERSION 3.12)

# Apply SIMD ISA flags and Highway linkage from parent.
# DISPENSO_FAST_MATH_SIMD_FLAGS, DISPENSO_FAST_MATH_HIGHWAY, REQUIRED_LIBS and
# OPTIONAL_LIBS are not set in this file; they are expected to come from an enclosing
# scope.
set(FAST_MATH_EXTRA_FLAGS ${DISPENSO_FAST_MATH_SIMD_FLAGS})
set(FAST_MATH_EXTRA_LIBS "")
if(DISPENSO_FAST_MATH_HIGHWAY AND TARGET hwy)
  list(APPEND FAST_MATH_EXTRA_LIBS hwy)
  # Treat Highway as a system include to suppress warnings from its headers.
  if(MSVC)
    list(APPEND FAST_MATH_EXTRA_FLAGS /external:I ${hwy_SOURCE_DIR})
  else()
    list(APPEND FAST_MATH_EXTRA_FLAGS -isystem${hwy_SOURCE_DIR})
  endif()
endif()

# Every .cpp file in this directory becomes its own benchmark executable.
file(GLOB FAST_MATH_BENCHMARK_FILES CONFIGURE_DEPENDS *.cpp)

foreach(BENCHMARK_FILE ${FAST_MATH_BENCHMARK_FILES})
  # Clear the name left over from the previous iteration before recomputing it.
  set(BENCHMARK_NAME)
  # Target name is "fast_math_" + the file name up to its first '.'.
  get_filename_component(BENCHMARK_NAME ${BENCHMARK_FILE} NAME_WE)
  set(BENCHMARK_NAME "fast_math_${BENCHMARK_NAME}")
  add_executable(${BENCHMARK_NAME} ${BENCHMARK_FILE})
  target_compile_features(${BENCHMARK_NAME} PRIVATE cxx_std_17)
  target_compile_options(${BENCHMARK_NAME} PRIVATE ${FAST_MATH_EXTRA_FLAGS})
  target_link_libraries(${BENCHMARK_NAME} ${REQUIRED_LIBS} ${OPTIONAL_LIBS} ${FAST_MATH_EXTRA_LIBS})
endforeach()


================================================
FILE: benchmarks/fast_math/avx512_benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "benchmark_helpers.h"

#if defined(__AVX512F__)

namespace dfm = dispenso::fast_math;
namespace bench = dispenso::fast_math::bench;
using Flt = __m512;

// --- One-arg benchmarks ---

// Each benchmark below hands bench::runBench an input table and a lambda applying one
// dispenso::fast_math function to a __m512 value. runBench is defined outside this
// file. Several functions reuse a table named for another function (for example atan
// and expm1 draw from sinInputs).

// Trigonometric and inverse trigonometric functions.
void BM_sin_avx512(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::sin(x); });
}
void BM_cos_avx512(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::cos(x); });
}
void BM_tan_avx512(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::tan(x); });
}
void BM_atan_avx512(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::atan(x); });
}
void BM_acos_avx512(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::acos(x); });
}
void BM_asin_avx512(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::asin(x); });
}

// Exponential family.
void BM_exp_avx512(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp(x); });
}
void BM_exp2_avx512(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp2(x); });
}
void BM_exp10_avx512(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp10(x); });
}
void BM_expm1_avx512(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::expm1(x); });
}

// Logarithm family.
void BM_log_avx512(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log(x); });
}
void BM_log2_avx512(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log2(x); });
}
void BM_log10_avx512(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log10(x); });
}
// The absolute value is taken first, so log1p only ever sees non-negative arguments;
// that extra instruction is part of what is timed.
void BM_log1p_avx512(benchmark::State& state) {
  bench::runBench(
      state, bench::sinInputs<Flt>(), [](Flt x) { return dfm::log1p(_mm512_abs_ps(x)); });
}

void BM_cbrt_avx512(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::cbrt(x); });
}

// Only the value returned by frexp is handed back to runBench; the exponent output `e`
// is discarded.
void BM_frexp_avx512(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    dfm::IntType_t<Flt> e;
    return dfm::frexp(x, &e);
  });
}
// ldexp is called with the exponent 3 broadcast to every lane.
void BM_ldexp_avx512(benchmark::State& state) {
  bench::runBench(
      state, bench::logInputs<Flt>(), [](auto x) { return dfm::ldexp(x, _mm512_set1_epi32(3)); });
}

void BM_tanh_avx512(benchmark::State& state) {
  bench::runBench(state, bench::tanhInputs<Flt>(), [](auto x) { return dfm::tanh(x); });
}
void BM_erf_avx512(benchmark::State& state) {
  bench::runBench(state, bench::erfInputs<Flt>(), [](auto x) { return dfm::erf(x); });
}

// --- Two-arg benchmarks ---

// Two-operand benchmarks: bench::runBench2 receives two input tables and a binary
// lambda. runBench2 is defined outside this file.

// y is drawn from expInputs and x from sinInputs.
void BM_atan2_avx512(benchmark::State& state) {
  bench::runBench2(state, bench::expInputs<Flt>(), bench::sinInputs<Flt>(), [](auto y, auto x) {
    return dfm::atan2(y, x);
  });
}

void BM_hypot_avx512(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot(x, y);
  });
}
// NOTE(review): the name says "bounds" but the instantiation uses
// dfm::MaxAccuracyTraits; confirm this is the intended traits type for this variant.
void BM_hypot_avx512_bounds(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot<Flt, dfm::MaxAccuracyTraits>(x, y);
  });
}

void BM_pow_avx512(benchmark::State& state) {
  bench::runBench2(
      state, bench::powBaseInputs<Flt>(), bench::powExpInputs<Flt>(), [](auto b, auto e) {
        return dfm::pow(b, e);
      });
}
void BM_pow_avx512_accurate(benchmark::State& state) {
  bench::runBench2(
      state, bench::powBaseInputs<Flt>(), bench::powExpInputs<Flt>(), [](auto b, auto e) {
        return dfm::pow<Flt, dfm::MaxAccuracyTraits>(b, e);
      });
}
// Vector base raised to the scalar float exponent 2.5f, so this goes through the
// single-input runBench.
void BM_pow_avx512_scalar_exp(benchmark::State& state) {
  bench::runBench(state, bench::powBaseInputs<Flt>(), [](auto x) { return dfm::pow(x, 2.5f); });
}

// --- Libc-packed comparisons (AVX-512-specific, kept hand-written) ---

// Baseline: libc ::hypotf applied lane by lane to the same inputs as the vector hypot
// benchmarks. Each iteration spills both operands to aligned scratch arrays, calls
// ::hypotf 16 times, reloads the results as a vector and accumulates them so the work
// cannot be discarded.
void BM_hypot_libc_avx512(benchmark::State& state) {
  const auto& inputs = bench::hypotInputs<Flt>();
  const auto& inputs2 = bench::sinInputs<Flt>();
  size_t idx = 0;
  Flt sum = _mm512_setzero_ps();
  for (auto _ : state) {
    (void)_;
    // 64-byte alignment is required by the aligned store/load intrinsics used here.
    alignas(64) float x[16], y[16], r[16];
    _mm512_store_ps(x, inputs[idx]);
    _mm512_store_ps(y, inputs2[idx]);
    for (int32_t i = 0; i < 16; ++i) {
      r[i] = ::hypotf(x[i], y[i]);
    }
    sum = _mm512_add_ps(sum, _mm512_load_ps(r));
    // Advance to the next input, wrapping with the mask.
    idx = (idx + 1) & bench::kInputsMask;
  }
  // 16 float results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 16);
  bench::consumeResult(sum);
}

// Baseline: libc ::powf applied lane by lane to the same base/exponent inputs as the
// vector pow benchmarks; same spill / compute / reload / accumulate structure as the
// hypot baseline above.
void BM_pow_libc_avx512(benchmark::State& state) {
  const auto& bases = bench::powBaseInputs<Flt>();
  const auto& exps = bench::powExpInputs<Flt>();
  size_t idx = 0;
  Flt sum = _mm512_setzero_ps();
  for (auto _ : state) {
    (void)_;
    // 64-byte alignment is required by the aligned store/load intrinsics used here.
    alignas(64) float x[16], y[16], r[16];
    _mm512_store_ps(x, bases[idx]);
    _mm512_store_ps(y, exps[idx]);
    for (int32_t j = 0; j < 16; ++j)
      r[j] = ::powf(x[j], y[j]);
    sum = _mm512_add_ps(sum, _mm512_load_ps(r));
    // Advance to the next input, wrapping with the mask.
    idx = (idx + 1) & bench::kInputsMask;
  }
  // 16 float results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 16);
  bench::consumeResult(sum);
}

// --- Registrations ---

// One-argument functions.
BENCHMARK(BM_sin_avx512);
BENCHMARK(BM_cos_avx512);
BENCHMARK(BM_tan_avx512);
BENCHMARK(BM_atan_avx512);
BENCHMARK(BM_acos_avx512);
BENCHMARK(BM_asin_avx512);
BENCHMARK(BM_exp_avx512);
BENCHMARK(BM_exp2_avx512);
BENCHMARK(BM_exp10_avx512);
BENCHMARK(BM_expm1_avx512);
BENCHMARK(BM_log_avx512);
BENCHMARK(BM_log2_avx512);
BENCHMARK(BM_log10_avx512);
BENCHMARK(BM_log1p_avx512);
BENCHMARK(BM_cbrt_avx512);
BENCHMARK(BM_frexp_avx512);
BENCHMARK(BM_ldexp_avx512);
BENCHMARK(BM_tanh_avx512);
BENCHMARK(BM_erf_avx512);
// Two-argument functions, each followed by its variants and libc baseline.
BENCHMARK(BM_atan2_avx512);
BENCHMARK(BM_hypot_avx512);
BENCHMARK(BM_hypot_avx512_bounds);
BENCHMARK(BM_hypot_libc_avx512);
BENCHMARK(BM_pow_avx512);
BENCHMARK(BM_pow_avx512_accurate);
BENCHMARK(BM_pow_avx512_scalar_exp);
BENCHMARK(BM_pow_libc_avx512);

#else // !defined(__AVX512F__)

// Fallback entry point for builds without AVX-512F: prints a notice and exits
// successfully so the executable can still be built and run.
int main() {
  const char* const notice = "AVX-512 not available, skipping benchmarks.";
  std::cout << notice << std::endl;
  return 0;
}

#endif // defined(__AVX512F__)


================================================
FILE: benchmarks/fast_math/avx_benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "benchmark_helpers.h"

#if defined(__AVX2__)

namespace dfm = dispenso::fast_math;
namespace bench = dispenso::fast_math::bench;
using Flt = __m256;

// Traits type passed as the second template argument of dfm functions (see
// BM_exp_avx_bounds): turns on kBoundsValues while leaving kMaxAccuracy off.
struct BoundsTraits {
  static constexpr bool kBoundsValues = true;
  static constexpr bool kMaxAccuracy = false;
};

// --- One-arg benchmarks ---

// Each benchmark below hands bench::runBench an input table and a lambda applying one
// dispenso::fast_math function to a __m256 value. runBench is defined outside this
// file. "_accurate" variants instantiate the function with dfm::MaxAccuracyTraits and
// "_bounds" variants with a bounds-enabled traits type. Several functions reuse a table
// named for another function (for example atan and expm1 draw from sinInputs).

// Trigonometric and inverse trigonometric functions.
void BM_sin_avx(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::sin(x); });
}
void BM_sin_avx_accurate(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) {
    return dfm::sin<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_cos_avx(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::cos(x); });
}
void BM_cos_avx_accurate(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) {
    return dfm::cos<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_tan_avx(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::tan(x); });
}
void BM_atan_avx(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::atan(x); });
}
void BM_acos_avx(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::acos(x); });
}
void BM_asin_avx(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::asin(x); });
}

// Exponential family.
void BM_exp_avx(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp(x); });
}
void BM_exp_avx_accurate(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) {
    return dfm::exp<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_exp_avx_bounds(benchmark::State& state) {
  bench::runBench(
      state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp<Flt, BoundsTraits>(x); });
}
void BM_exp2_avx(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp2(x); });
}
void BM_exp10_avx(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp10(x); });
}
void BM_expm1_avx(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::expm1(x); });
}

// Logarithm family.
void BM_log_avx(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log(x); });
}
void BM_log_avx_accurate(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    return dfm::log<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_log2_avx(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log2(x); });
}
void BM_log10_avx(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log10(x); });
}
void BM_log1p_avx(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](Flt x) {
    // Clear the sign bit (and-not with -0.0f) so log1p only sees non-negative
    // arguments; this extra instruction is part of what is timed.
    Flt ax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), x);
    return dfm::log1p(ax);
  });
}

void BM_cbrt_avx(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::cbrt(x); });
}
void BM_cbrt_avx_accurate(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    return dfm::cbrt<Flt, dfm::MaxAccuracyTraits>(x);
  });
}

// Only the value returned by frexp is handed back to runBench; the exponent output `e`
// is discarded.
void BM_frexp_avx(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    dfm::IntType_t<Flt> e;
    return dfm::frexp(x, &e);
  });
}
// ldexp is called with the exponent 3 broadcast to every lane.
void BM_ldexp_avx(benchmark::State& state) {
  bench::runBench(
      state, bench::logInputs<Flt>(), [](auto x) { return dfm::ldexp(x, _mm256_set1_epi32(3)); });
}

void BM_tanh_avx(benchmark::State& state) {
  bench::runBench(state, bench::tanhInputs<Flt>(), [](auto x) { return dfm::tanh(x); });
}
void BM_erf_avx(benchmark::State& state) {
  bench::runBench(state, bench::erfInputs<Flt>(), [](auto x) { return dfm::erf(x); });
}

// --- Two-arg benchmarks ---

// Two-operand benchmarks: bench::runBench2 receives two input tables and a binary
// lambda. runBench2 is defined outside this file.

// y is drawn from expInputs and x from sinInputs.
void BM_atan2_avx(benchmark::State& state) {
  bench::runBench2(state, bench::expInputs<Flt>(), bench::sinInputs<Flt>(), [](auto y, auto x) {
    return dfm::atan2(y, x);
  });
}

void BM_hypot_avx(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot(x, y);
  });
}
// NOTE(review): the name says "bounds" but the instantiation uses
// dfm::MaxAccuracyTraits rather than this file's BoundsTraits (which BM_exp_avx_bounds
// uses); confirm this is the intended traits type for this variant.
void BM_hypot_avx_bounds(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot<Flt, dfm::MaxAccuracyTraits>(x, y);
  });
}

void BM_pow_avx(benchmark::State& state) {
  bench::runBench2(
      state, bench::powBaseInputs<Flt>(), bench::powExpInputs<Flt>(), [](auto b, auto e) {
        return dfm::pow(b, e);
      });
}
void BM_pow_avx_accurate(benchmark::State& state) {
  bench::runBench2(
      state, bench::powBaseInputs<Flt>(), bench::powExpInputs<Flt>(), [](auto b, auto e) {
        return dfm::pow<Flt, dfm::MaxAccuracyTraits>(b, e);
      });
}
// Vector base raised to the scalar float exponent 2.5f, so this goes through the
// single-input runBench.
void BM_pow_avx_scalar_exp(benchmark::State& state) {
  bench::runBench(state, bench::powBaseInputs<Flt>(), [](auto x) { return dfm::pow(x, 2.5f); });
}

// --- Libc-packed comparison (AVX-specific, kept hand-written) ---

// Baseline: libc ::powf applied lane by lane to the same base/exponent inputs as the
// vector pow benchmarks. Each iteration spills both operands to aligned scratch arrays,
// calls ::powf 8 times, reloads the results as a vector and accumulates them so the
// work cannot be discarded.
void BM_pow_libc_avx(benchmark::State& state) {
  const auto& bases = bench::powBaseInputs<Flt>();
  const auto& exps = bench::powExpInputs<Flt>();
  size_t idx = 0;
  Flt sum = _mm256_setzero_ps();
  for (auto _ : state) {
    (void)_;
    // 32-byte alignment is required by the aligned store/load intrinsics used here.
    alignas(32) float x[8], y[8], r[8];
    _mm256_store_ps(x, bases[idx]);
    _mm256_store_ps(y, exps[idx]);
    for (int32_t i = 0; i < 8; ++i) {
      r[i] = ::powf(x[i], y[i]);
    }
    sum = _mm256_add_ps(sum, _mm256_load_ps(r));
    // Advance to the next input, wrapping with the mask.
    idx = (idx + 1) & bench::kInputsMask;
  }
  // 8 float results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 8);
  bench::consumeResult(sum);
}

// --- Registrations ---

// One-argument functions, each followed by its accuracy/bounds variants.
BENCHMARK(BM_sin_avx);
BENCHMARK(BM_sin_avx_accurate);
BENCHMARK(BM_cos_avx);
BENCHMARK(BM_cos_avx_accurate);
BENCHMARK(BM_tan_avx);
BENCHMARK(BM_atan_avx);
BENCHMARK(BM_acos_avx);
BENCHMARK(BM_asin_avx);
BENCHMARK(BM_exp_avx);
BENCHMARK(BM_exp_avx_accurate);
BENCHMARK(BM_exp_avx_bounds);
BENCHMARK(BM_exp2_avx);
BENCHMARK(BM_exp10_avx);
BENCHMARK(BM_expm1_avx);
BENCHMARK(BM_log_avx);
BENCHMARK(BM_log_avx_accurate);
BENCHMARK(BM_log2_avx);
BENCHMARK(BM_log10_avx);
BENCHMARK(BM_log1p_avx);
BENCHMARK(BM_cbrt_avx);
BENCHMARK(BM_cbrt_avx_accurate);
BENCHMARK(BM_frexp_avx);
BENCHMARK(BM_ldexp_avx);
BENCHMARK(BM_tanh_avx);
BENCHMARK(BM_erf_avx);
// Two-argument functions, their variants and the libc pow baseline.
BENCHMARK(BM_atan2_avx);
BENCHMARK(BM_hypot_avx);
BENCHMARK(BM_hypot_avx_bounds);
BENCHMARK(BM_pow_avx);
BENCHMARK(BM_pow_avx_accurate);
BENCHMARK(BM_pow_avx_scalar_exp);
BENCHMARK(BM_pow_libc_avx);

#else // !defined(__AVX2__)

// Fallback entry point for builds without AVX2: prints a notice and exits successfully
// so the executable can still be built and run.
int main() {
  const char* const notice = "AVX2 not available, skipping benchmarks.";
  std::cout << notice << std::endl;
  return 0;
}

#endif // defined(__AVX2__)


================================================
FILE: benchmarks/fast_math/benchmark_helpers.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

#include <benchmark/benchmark.h>
#include <dispenso/fast_math/fast_math.h>

#if __has_include("hwy/highway.h")
#include "hwy/highway.h"
#endif

namespace dispenso {
namespace fast_math {
namespace bench {

// Size constant for the benchmark input tables. It must be a power of two because
// benchmarks wrap their input index with `& kInputsMask`.
// NOTE(review): presumably the number of entries in each *Inputs<Flt>() table; the
// table definitions are outside this part of the header, so confirm.
constexpr size_t kNumInputs = 4096;
// Derived from kNumInputs instead of being hard-coded so the two cannot drift apart.
constexpr size_t kInputsMask = kNumInputs - 1;
static_assert(
    kNumInputs != 0 && (kNumInputs & kInputsMask) == 0,
    "kNumInputs must be a power of two for mask-based index wrapping");
// NOTE(review): looks like an upper bound on lanes per vector type; its use is outside
// this part of the header, so confirm.
constexpr int32_t kMaxBenchLanes = 64;

// --- Per-type primitives ---
//
// Each SIMD type needs: laneCount, loadVec, zeroVec, addVec, consumeResult.
// These are template specializations so the compiler sees the exact operations
// at each call site, ensuring zero overhead.

// Primary templates are declared only; each supported Flt supplies an explicit
// specialization below.

// Number of float lanes held by one Flt value.
template <typename Flt>
inline int32_t laneCount();
// Loads one Flt from `data`. The SIMD specializations use aligned load intrinsics, so
// `data` must be aligned for the vector type.
template <typename Flt>
inline Flt loadVec(const float* data);
// Returns a Flt with every lane set to zero.
template <typename Flt>
inline Flt zeroVec();
// Lane-wise sum of a and b.
template <typename Flt>
inline Flt addVec(Flt a, Flt b);
// Prints the sum of all lanes of `sum` to std::cout, giving the accumulated benchmark
// result an observable use.
template <typename Flt>
inline void consumeResult(Flt sum);

// --- Scalar (float) ---

template <>
inline int32_t laneCount<float>() {
  return 1;
}
template <>
inline float loadVec<float>(const float* data) {
  return *data;
}
template <>
inline float zeroVec<float>() {
  return 0.0f;
}
template <>
inline float addVec<float>(float a, float b) {
  return a + b;
}
template <>
inline void consumeResult<float>(float sum) {
  std::cout << sum << std::endl;
}

// --- SSE (__m128) ---

#if defined(__SSE4_1__)

// One SSE register holds four packed floats.
template <>
inline int32_t laneCount<__m128>() {
  return 4;
}
// Aligned 128-bit load (_mm_load_ps expects a 16-byte aligned pointer).
template <>
inline __m128 loadVec<__m128>(const float* data) {
  const __m128 loaded = _mm_load_ps(data);
  return loaded;
}
// All four lanes set to zero.
template <>
inline __m128 zeroVec<__m128>() {
  return _mm_setzero_ps();
}
// Lane-wise addition.
template <>
inline __m128 addVec<__m128>(__m128 a, __m128 b) {
  return _mm_add_ps(a, b);
}
// Spills the register to memory and prints the left-to-right sum of its lanes.
template <>
inline void consumeResult<__m128>(__m128 sum) {
  alignas(16) float lanes[4];
  _mm_store_ps(lanes, sum);
  const float total = ((lanes[0] + lanes[1]) + lanes[2]) + lanes[3];
  std::cout << total << std::endl;
}

#endif // __SSE4_1__

// --- AVX (__m256) ---

#if defined(__AVX2__)

// One AVX register holds eight packed floats.
template <>
inline int32_t laneCount<__m256>() {
  return 8;
}
// Aligned 256-bit load (_mm256_load_ps expects a 32-byte aligned pointer).
template <>
inline __m256 loadVec<__m256>(const float* data) {
  const __m256 loaded = _mm256_load_ps(data);
  return loaded;
}
// All eight lanes set to zero.
template <>
inline __m256 zeroVec<__m256>() {
  return _mm256_setzero_ps();
}
// Lane-wise addition.
template <>
inline __m256 addVec<__m256>(__m256 a, __m256 b) {
  return _mm256_add_ps(a, b);
}
// Spills the register to memory and prints the sum of its lanes, accumulated
// from lane 0 upward starting at 0.0f.
template <>
inline void consumeResult<__m256>(__m256 sum) {
  alignas(32) float lanes[8];
  _mm256_store_ps(lanes, sum);
  float total = 0.0f;
  for (const float lane : lanes) {
    total += lane;
  }
  std::cout << total << std::endl;
}

#endif // __AVX2__

// --- AVX-512 (__m512) ---

#if defined(__AVX512F__)

// One AVX-512 register holds sixteen packed floats.
template <>
inline int32_t laneCount<__m512>() {
  return 16;
}
// Aligned 512-bit load (_mm512_load_ps expects a 64-byte aligned pointer).
template <>
inline __m512 loadVec<__m512>(const float* data) {
  const __m512 loaded = _mm512_load_ps(data);
  return loaded;
}
// All sixteen lanes set to zero.
template <>
inline __m512 zeroVec<__m512>() {
  return _mm512_setzero_ps();
}
// Lane-wise addition.
template <>
inline __m512 addVec<__m512>(__m512 a, __m512 b) {
  return _mm512_add_ps(a, b);
}
// Spills the register to memory and prints the sum of its lanes, accumulated
// from lane 0 upward starting at 0.0f.
template <>
inline void consumeResult<__m512>(__m512 sum) {
  alignas(64) float lanes[16];
  _mm512_store_ps(lanes, sum);
  float total = 0.0f;
  for (const float lane : lanes) {
    total += lane;
  }
  std::cout << total << std::endl;
}

#endif // __AVX512F__

// --- NEON (float32x4_t) ---

#if defined(__aarch64__)

// One NEON q-register holds four packed floats.
template <>
inline int32_t laneCount<float32x4_t>() {
  return 4;
}
// Loads four consecutive floats starting at data.
template <>
inline float32x4_t loadVec<float32x4_t>(const float* data) {
  const float32x4_t loaded = vld1q_f32(data);
  return loaded;
}
// All four lanes set to zero (broadcast of 0.0f).
template <>
inline float32x4_t zeroVec<float32x4_t>() {
  return vdupq_n_f32(0.0f);
}
// Lane-wise addition.
template <>
inline float32x4_t addVec<float32x4_t>(float32x4_t a, float32x4_t b) {
  return vaddq_f32(a, b);
}
// Spills the register to memory and prints the left-to-right sum of its lanes.
template <>
inline void consumeResult<float32x4_t>(float32x4_t sum) {
  alignas(16) float lanes[4];
  vst1q_f32(lanes, sum);
  const float total = ((lanes[0] + lanes[1]) + lanes[2]) + lanes[3];
  std::cout << total << std::endl;
}

#endif // __aarch64__

// --- Highway (HwyFloat) ---

#if __has_include("hwy/highway.h")

namespace hn = hwy::HWY_NAMESPACE;

// Lane count is whatever Highway reports for HwyFloatTag.
template <>
inline int32_t laneCount<HwyFloat>() {
  const auto lanes = hn::Lanes(HwyFloatTag());
  return static_cast<int32_t>(lanes);
}
// Unaligned load of one Highway vector starting at data.
template <>
inline HwyFloat loadVec<HwyFloat>(const float* data) {
  return hn::LoadU(HwyFloatTag(), data);
}
// Highway zero vector.
template <>
inline HwyFloat zeroVec<HwyFloat>() {
  return hn::Zero(HwyFloatTag());
}
// Lane-wise addition of the wrapped vectors.
template <>
inline HwyFloat addVec<HwyFloat>(HwyFloat a, HwyFloat b) {
  return hn::Add(a.v, b.v);
}
// Stores the vector into a buffer sized for the widest Highway vector
// (HWY_MAX_BYTES), then prints the sum of the hn::Lanes(tag) lanes that were
// written, accumulated from lane 0 upward starting at 0.0f.
template <>
inline void consumeResult<HwyFloat>(HwyFloat sum) {
  const HwyFloatTag tag;
  constexpr size_t kCapacity = HWY_MAX_BYTES / sizeof(float);
  HWY_ALIGN float lanes[kCapacity];
  hn::StoreU(sum.v, tag, lanes);
  const size_t active = hn::Lanes(tag);
  float total = 0.0f;
  for (size_t lane = 0; lane != active; ++lane) {
    total += lanes[lane];
  }
  std::cout << total << std::endl;
}

#endif // hwy/highway.h

// --- Input generation ---
//
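// makeInputs<Flt>(lo, hi) generates kNumInputs vectors whose values start at lo
// and are spaced delta = (hi - lo) / kNumInputs apart. Each SIMD vector packs N
// consecutive values and the next vector continues where the previous one
// ended, so the generated values span [lo, lo + N * (hi - lo)); only the scalar
// case (N == 1) stays within [lo, hi).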

template <typename Flt>
inline std::vector<Flt> makeInputs(float lo, float hi) {
  const int32_t N = laneCount<Flt>();
  float delta = (hi - lo) / static_cast<float>(kNumInputs);
  std::vector<Flt> inputs;
  inputs.reserve(kNumInputs);
  alignas(64) float buf[kMaxBenchLanes];
  float f = lo;
  for (size_t i = 0; i < kNumInputs; ++i) {
    for (int32_t j = 0; j < N; ++j) {
      buf[j] = f + static_cast<float>(j) * delta;
    }
    inputs.push_back(loadVec<Flt>(buf));
    f += static_cast<float>(N) * delta;
  }
  return inputs;
}

// --- Pre-defined input factories ---
//
// These match the ranges used across all SIMD benchmark files.

// Each factory builds its table once (function-local static) on first use and
// hands out a const reference to it afterwards.

// Inputs for sin/cos/tan-style benchmarks: makeInputs over [-pi/2, pi/2].
template <typename Flt>
inline const std::vector<Flt>& sinInputs() {
  static const std::vector<Flt> cached =
      makeInputs<Flt>(static_cast<float>(-M_PI / 2.0), static_cast<float>(M_PI / 2.0));
  return cached;
}

// makeInputs over [-10, 10].
template <typename Flt>
inline const std::vector<Flt>& expInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(-10.0f, 10.0f);
  return cached;
}

// makeInputs over [0.001, 10000].
template <typename Flt>
inline const std::vector<Flt>& logInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(0.001f, 10000.0f);
  return cached;
}

// makeInputs over [-0.999, 0.999].
template <typename Flt>
inline const std::vector<Flt>& acosInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(-0.999f, 0.999f);
  return cached;
}

// makeInputs over [-100000, 100000].
template <typename Flt>
inline const std::vector<Flt>& hypotInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(-100000.0f, 100000.0f);
  return cached;
}

// makeInputs over [-5, 5].
template <typename Flt>
inline const std::vector<Flt>& tanhInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(-5.0f, 5.0f);
  return cached;
}

// makeInputs over [-4, 4].
template <typename Flt>
inline const std::vector<Flt>& erfInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(-4.0f, 4.0f);
  return cached;
}

// Bases for pow: makeInputs over [0.01, 100].
template <typename Flt>
inline const std::vector<Flt>& powBaseInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(0.01f, 100.0f);
  return cached;
}

// Exponents for pow: makeInputs over [-8, 8].
template <typename Flt>
inline const std::vector<Flt>& powExpInputs() {
  static const std::vector<Flt> cached = makeInputs<Flt>(-8.0f, 8.0f);
  return cached;
}

// --- Benchmark runners ---
//
// runBench: one-arg function benchmark. Func is a template parameter (lambda),
// so the compiler sees the exact call target and inlines at -O2. This produces
// the same assembly as hand-written code.
//
// runBench2: two-arg function benchmark (atan2, hypot, pow).

// Runs fn on one input per benchmark iteration, cycling through the first
// kInputsMask + 1 entries of `inputs` (which must therefore hold at least that
// many), and accumulates the results so the calls cannot be discarded. The
// accumulated sum is printed after the timing loop.
template <typename Flt, typename Func>
inline void runBench(benchmark::State& state, const std::vector<Flt>& inputs, Func fn) {
  Flt acc = zeroVec<Flt>();
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    acc = addVec<Flt>(acc, fn(inputs[cursor]));
    // Wrap around with a mask instead of a modulo.
    cursor = (cursor + 1) & kInputsMask;
  }
  // Each iteration processes one vector, i.e. laneCount<Flt>() scalar items.
  const int64_t lanes = static_cast<int64_t>(laneCount<Flt>());
  state.SetItemsProcessed(state.iterations() * lanes);
  consumeResult<Flt>(acc);
}

// Two-argument counterpart of runBench: each iteration calls
// fn(xInputs[i], yInputs[i]) with the same cycling index for both tables (each
// must hold at least kInputsMask + 1 entries), accumulates the results, and
// prints the sum after the timing loop.
template <typename Flt, typename Func>
inline void runBench2(
    benchmark::State& state,
    const std::vector<Flt>& xInputs,
    const std::vector<Flt>& yInputs,
    Func fn) {
  Flt acc = zeroVec<Flt>();
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    acc = addVec<Flt>(acc, fn(xInputs[cursor], yInputs[cursor]));
    // Wrap around with a mask instead of a modulo.
    cursor = (cursor + 1) & kInputsMask;
  }
  // Each iteration processes one vector pair, i.e. laneCount<Flt>() scalar items.
  const int64_t lanes = static_cast<int64_t>(laneCount<Flt>());
  state.SetItemsProcessed(state.iterations() * lanes);
  consumeResult<Flt>(acc);
}

} // namespace bench
} // namespace fast_math
} // namespace dispenso


================================================
FILE: benchmarks/fast_math/benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "benchmark_helpers.h"

namespace dfm = dispenso::fast_math;
namespace bench = dispenso::fast_math::bench;

constexpr size_t kNumInputs = 4096;
constexpr size_t kInputsMask = 4095;

// Traits type passed as the second template argument to dfm:: functions by the
// *_bounds benchmarks in this file (BM_fastm_exp_bounds, BM_fastm_atan2_bounds).
// NOTE(review): presumably kBoundsValues turns on handling of special/edge
// inputs and kMaxAccuracy selects the higher-accuracy code path — confirm
// against the traits documentation in fast_math.h.
struct BoundsTraits {
  static constexpr bool kMaxAccuracy = false;
  static constexpr bool kBoundsValues = true;
};

// --- Input generators (scalar-specific ranges, different from SIMD) ---

// Scalar inputs for acos/asin: values stepping from -1 to 1 in increments of
// 2 / kNumInputs, padded with the last value if fewer than kNumInputs were
// generated. Built once on first call.
const std::vector<float>& acosInputs() {
  static std::vector<float> inputs = []() {
    const float step = 2.0f / kNumInputs;
    std::vector<float> samples;
    float cur = -1.0f;
    while (cur <= 1.0f) {
      samples.push_back(cur);
      cur += step;
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    if (samples.size() < kNumInputs) {
      const float last = samples.back();
      samples.resize(kNumInputs, last);
    }
    return samples;
  }();
  return inputs;
}

// Scalar inputs for cbrt/atan (and the y argument of atan2): values stepping
// from -50000 to 50000, padded with the last value if fewer than kNumInputs
// were generated. Built once on first call.
//
// The step is the full span (100000) divided by kNumInputs, matching the other
// generators in this file. The previous step of 50000 / kNumInputs produced
// roughly twice kNumInputs samples; because the benchmark loops wrap their
// index with kInputsMask, only the first kNumInputs of them — the negative half
// of the range — were ever used.
const std::vector<float>& cbrtInputs() {
  static std::vector<float> inputs = []() {
    float delta = 100000.0f / kNumInputs;
    std::vector<float> inp;
    for (float f = -50000.f; f <= 50000.f; f += delta) {
      inp.push_back(f);
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    while (inp.size() < kNumInputs) {
      inp.push_back(inp.back());
    }
    return inp;
  }();
  return inputs;
}

// Scalar inputs shared by the trig/exp-style benchmarks: values stepping from
// -pi/2 to pi/2 in increments of pi / kNumInputs, padded with the last value if
// fewer than kNumInputs were generated. Built once on first call.
const std::vector<float>& sinInputs() {
  static std::vector<float> inputs = []() {
    const float step = M_PI / kNumInputs;
    std::vector<float> samples;
    // The bound stays a double expression, so the comparison is done in double.
    float cur = -M_PI / 2.0;
    while (cur <= M_PI / 2.0) {
      samples.push_back(cur);
      cur += step;
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    if (samples.size() < kNumInputs) {
      const float last = samples.back();
      samples.resize(kNumInputs, last);
    }
    return samples;
  }();
  return inputs;
}

// Scalar inputs for the log benchmarks: values stepping from 0 to 10000 in
// increments of 10000 / kNumInputs, padded with the last value if fewer than
// kNumInputs were generated. Note the first sample is exactly 0.0f. Built once
// on first call.
const std::vector<float>& logInputs() {
  static std::vector<float> inputs = []() {
    const float step = 10000.0f / kNumInputs;
    std::vector<float> samples;
    float cur = 0.0f;
    while (cur <= 10000.0f) {
      samples.push_back(cur);
      cur += step;
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    if (samples.size() < kNumInputs) {
      const float last = samples.back();
      samples.resize(kNumInputs, last);
    }
    return samples;
  }();
  return inputs;
}

// Scalar inputs for the first argument of the hypot benchmarks: values stepping
// from -100000 to 100000 in increments of 200000 / kNumInputs, padded with the
// last value if fewer than kNumInputs were generated. Built once on first call.
const std::vector<float>& hypotInputs() {
  static std::vector<float> inputs = []() {
    const float step = 200000.0f / kNumInputs;
    std::vector<float> samples;
    float cur = -100000.f;
    while (cur <= 100000.f) {
      samples.push_back(cur);
      cur += step;
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    if (samples.size() < kNumInputs) {
      const float last = samples.back();
      samples.resize(kNumInputs, last);
    }
    return samples;
  }();
  return inputs;
}

// Scalar bases for the pow benchmarks: values stepping from 0.01 to 100 in
// increments of 99.99 / kNumInputs, padded with the last value if fewer than
// kNumInputs were generated. Built once on first call.
const std::vector<float>& powBaseInputs() {
  static std::vector<float> inputs = []() {
    const float step = 99.99f / kNumInputs;
    std::vector<float> samples;
    float cur = 0.01f;
    while (cur <= 100.0f) {
      samples.push_back(cur);
      cur += step;
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    if (samples.size() < kNumInputs) {
      const float last = samples.back();
      samples.resize(kNumInputs, last);
    }
    return samples;
  }();
  return inputs;
}

// Scalar exponents for the pow benchmarks: values stepping from -8 to 8 in
// increments of 16 / kNumInputs, padded with the last value if fewer than
// kNumInputs were generated. Built once on first call.
const std::vector<float>& powExpInputs() {
  static std::vector<float> inputs = []() {
    const float step = 16.0f / kNumInputs;
    std::vector<float> samples;
    float cur = -8.0f;
    while (cur <= 8.0f) {
      samples.push_back(cur);
      cur += step;
    }
    // Consumers index with a kInputsMask wrap, so guarantee at least kNumInputs entries.
    if (samples.size() < kNumInputs) {
      const float last = samples.back();
      samples.resize(kNumInputs, last);
    }
    return samples;
  }();
  return inputs;
}

// Scalar inputs for the tanh benchmarks: exactly kNumInputs values starting at
// -5 and advancing by 10 / kNumInputs each. Built once on first call.
const std::vector<float>& tanhInputs() {
  static std::vector<float> inputs = []() {
    const float step = 10.0f / kNumInputs;
    std::vector<float> samples;
    float cur = -5.0f;
    for (size_t produced = 0; produced < kNumInputs; ++produced) {
      samples.push_back(cur);
      cur += step;
    }
    return samples;
  }();
  return inputs;
}

// --- Libc benchmarks ---

// Each libc baseline below wraps one C math function in a lambda and feeds it
// through bench::runBench (one argument) or bench::runBench2 (two arguments)
// using the scalar input tables defined above.

void BM_acos(benchmark::State& state) {
  const auto op = [](float v) { return ::acosf(v); };
  bench::runBench(state, acosInputs(), op);
}
void BM_asin(benchmark::State& state) {
  const auto op = [](float v) { return ::asinf(v); };
  bench::runBench(state, acosInputs(), op);
}
void BM_atan(benchmark::State& state) {
  const auto op = [](float v) { return ::atanf(v); };
  bench::runBench(state, cbrtInputs(), op);
}
void BM_cbrt(benchmark::State& state) {
  const auto op = [](float v) { return ::cbrtf(v); };
  bench::runBench(state, cbrtInputs(), op);
}
void BM_sin(benchmark::State& state) {
  const auto op = [](float v) { return ::sinf(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_cos(benchmark::State& state) {
  const auto op = [](float v) { return ::cosf(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_tan(benchmark::State& state) {
  const auto op = [](float v) { return ::tanf(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_exp(benchmark::State& state) {
  const auto op = [](float v) { return ::expf(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_exp2(benchmark::State& state) {
  const auto op = [](float v) { return ::exp2f(v); };
  bench::runBench(state, sinInputs(), op);
}
// exp10 baseline is expressed as powf(10, x).
void BM_exp10(benchmark::State& state) {
  const auto op = [](float v) { return ::powf(10.0f, v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_log(benchmark::State& state) {
  const auto op = [](float v) { return ::logf(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_log2(benchmark::State& state) {
  const auto op = [](float v) { return ::log2f(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_log10(benchmark::State& state) {
  const auto op = [](float v) { return ::log10f(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_expm1(benchmark::State& state) {
  const auto op = [](float v) { return ::expm1f(v); };
  bench::runBench(state, sinInputs(), op);
}
// The absolute value keeps the log1p argument non-negative.
void BM_log1p(benchmark::State& state) {
  const auto op = [](float v) { return ::log1pf(std::fabs(v)); };
  bench::runBench(state, sinInputs(), op);
}
void BM_tanh(benchmark::State& state) {
  const auto op = [](float v) { return ::tanhf(v); };
  bench::runBench(state, tanhInputs(), op);
}
void BM_sin_plus_cos(benchmark::State& state) {
  const auto op = [](float v) { return ::sinf(v) + ::cosf(v); };
  bench::runBench(state, sinInputs(), op);
}

void BM_atan2(benchmark::State& state) {
  const auto op = [](float y, float x) { return ::atan2f(y, x); };
  bench::runBench2(state, cbrtInputs(), sinInputs(), op);
}
void BM_hypot(benchmark::State& state) {
  const auto op = [](float x, float y) { return ::hypotf(x, y); };
  bench::runBench2(state, hypotInputs(), sinInputs(), op);
}
void BM_pow(benchmark::State& state) {
  const auto op = [](float base, float exponent) { return ::powf(base, exponent); };
  bench::runBench2(state, powBaseInputs(), powExpInputs(), op);
}

// frexp and ldexp use non-standard loop patterns — kept hand-written.
// libc frexpf baseline. Both outputs (the returned fraction and the exponent
// written through the pointer) are accumulated and printed so neither can be
// optimized away.
void BM_frexp(benchmark::State& state) {
  const std::vector<float>& inputs = sinInputs();
  float fractionSum = 0.0f;
  int64_t exponentSum = 0;
  int exponent;
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    fractionSum += ::frexpf(inputs[cursor], &exponent);
    exponentSum += exponent;
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  std::cout << fractionSum << " " << exponentSum << std::endl;
}

// libc ldexpf baseline. The exponent argument cycles through 0..7, derived
// from the low three bits of the input index.
void BM_ldexp(benchmark::State& state) {
  const std::vector<float>& inputs = sinInputs();
  float acc = 0.0f;
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    acc += ::ldexpf(inputs[cursor], cursor & 7);
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  std::cout << acc << std::endl;
}

// --- Dispenso fast_math benchmarks ---

// Each benchmark below wraps one dispenso::fast_math (dfm) function in a lambda
// and feeds it through bench::runBench / bench::runBench2 with the same input
// tables as the corresponding libc baseline. The *_accurate variants
// instantiate the function with dfm::MaxAccuracyTraits; the *_bounds variants
// pass a traits type as noted on each.

void BM_fastm_acos(benchmark::State& state) {
  const auto op = [](float v) { return dfm::acos(v); };
  bench::runBench(state, acosInputs(), op);
}
void BM_fastm_asin(benchmark::State& state) {
  const auto op = [](float v) { return dfm::asin(v); };
  bench::runBench(state, acosInputs(), op);
}
void BM_fastm_atan(benchmark::State& state) {
  const auto op = [](float v) { return dfm::atan(v); };
  bench::runBench(state, cbrtInputs(), op);
}
void BM_fastm_cbrt(benchmark::State& state) {
  const auto op = [](float v) { return dfm::cbrt(v); };
  bench::runBench(state, cbrtInputs(), op);
}
void BM_fastm_cbrt_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::cbrt<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, cbrtInputs(), op);
}
void BM_fastm_sin(benchmark::State& state) {
  const auto op = [](float v) { return dfm::sin(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_sin_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::sin<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_cos(benchmark::State& state) {
  const auto op = [](float v) { return dfm::cos(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_cos_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::cos<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_tan(benchmark::State& state) {
  const auto op = [](float v) { return dfm::tan(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_tan_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::tan<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_exp(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp(v); };
  bench::runBench(state, sinInputs(), op);
}
// Uses the file-local BoundsTraits.
void BM_fastm_exp_bounds(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp<float, BoundsTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_exp_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_exp2(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp2(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_exp2_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp2<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_exp10(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp10(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_exp10_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::exp10<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_log(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_fastm_log_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_fastm_log2(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log2(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_fastm_log2_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log2<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_fastm_log10(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log10(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_fastm_log10_accurate(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log10<float, dfm::MaxAccuracyTraits>(v); };
  bench::runBench(state, logInputs(), op);
}
void BM_fastm_expm1(benchmark::State& state) {
  const auto op = [](float v) { return dfm::expm1(v); };
  bench::runBench(state, sinInputs(), op);
}
// The absolute value keeps the log1p argument non-negative.
void BM_fastm_log1p(benchmark::State& state) {
  const auto op = [](float v) { return dfm::log1p(std::fabs(v)); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_tanh(benchmark::State& state) {
  const auto op = [](float v) { return dfm::tanh(v); };
  bench::runBench(state, tanhInputs(), op);
}

void BM_fastm_atan2(benchmark::State& state) {
  const auto op = [](float y, float x) { return dfm::atan2(y, x); };
  bench::runBench2(state, cbrtInputs(), sinInputs(), op);
}
// Uses the file-local BoundsTraits.
void BM_fastm_atan2_bounds(benchmark::State& state) {
  const auto op = [](float y, float x) { return dfm::atan2<float, BoundsTraits>(y, x); };
  bench::runBench2(state, cbrtInputs(), sinInputs(), op);
}
void BM_fastm_hypot(benchmark::State& state) {
  const auto op = [](float x, float y) { return dfm::hypot(x, y); };
  bench::runBench2(state, hypotInputs(), sinInputs(), op);
}
// NOTE(review): despite the _bounds name this instantiates hypot with
// dfm::MaxAccuracyTraits, whereas the exp/atan2 _bounds benchmarks use
// BoundsTraits — confirm this is intended.
void BM_fastm_hypot_bounds(benchmark::State& state) {
  const auto op = [](float x, float y) {
    return dfm::hypot<float, dfm::MaxAccuracyTraits>(x, y);
  };
  bench::runBench2(state, hypotInputs(), sinInputs(), op);
}
// Straightforward sqrt(x*x + y*y) reference, with the x*x term fused via fmaf.
void BM_naive_hypot(benchmark::State& state) {
  const auto op = [](float x, float y) { return sqrtf(fmaf(x, x, y * y)); };
  bench::runBench2(state, hypotInputs(), sinInputs(), op);
}
void BM_fastm_pow(benchmark::State& state) {
  const auto op = [](float base, float exponent) { return dfm::pow(base, exponent); };
  bench::runBench2(state, powBaseInputs(), powExpInputs(), op);
}
void BM_fastm_pow_accurate(benchmark::State& state) {
  const auto op = [](float base, float exponent) {
    return dfm::pow<float, dfm::MaxAccuracyTraits>(base, exponent);
  };
  bench::runBench2(state, powBaseInputs(), powExpInputs(), op);
}

// frexp and ldexp use non-standard loop patterns — kept hand-written.
// dfm::frexp counterpart of BM_frexp. Both outputs (the returned fraction and
// the exponent written through the pointer) are accumulated and printed so
// neither can be optimized away.
void BM_fastm_frexp(benchmark::State& state) {
  const std::vector<float>& inputs = sinInputs();
  float fractionSum = 0.0f;
  int64_t exponentSum = 0;
  int exponent;
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    fractionSum += dfm::frexp(inputs[cursor], &exponent);
    exponentSum += exponent;
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  std::cout << fractionSum << " " << exponentSum << std::endl;
}

// dfm::ldexp counterpart of BM_ldexp. The exponent argument cycles through
// 0..7, derived from the low three bits of the input index.
void BM_fastm_ldexp(benchmark::State& state) {
  const std::vector<float>& inputs = sinInputs();
  float acc = 0.0f;
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    acc += dfm::ldexp(inputs[cursor], cursor & 7);
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  std::cout << acc << std::endl;
}

// --- sincos / sincospi benchmarks ---

// Separate dfm::sin and dfm::cos calls, summed — the reference point for the
// combined sincos call below.
void BM_fastm_sin_plus_cos(benchmark::State& state) {
  const auto op = [](float v) { return dfm::sin(v) + dfm::cos(v); };
  bench::runBench(state, sinInputs(), op);
}
// Single dfm::sincos call producing both values; their sum is returned so both
// outputs are consumed.
void BM_fastm_sincos(benchmark::State& state) {
  const auto op = [](float v) {
    float sinOut, cosOut;
    dfm::sincos(v, &sinOut, &cosOut);
    return sinOut + cosOut;
  };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_sinpi(benchmark::State& state) {
  const auto op = [](float v) { return dfm::sinpi(v); };
  bench::runBench(state, sinInputs(), op);
}
void BM_fastm_cospi(benchmark::State& state) {
  const auto op = [](float v) { return dfm::cospi(v); };
  bench::runBench(state, sinInputs(), op);
}
// Single dfm::sincospi call producing both values; their sum is returned so
// both outputs are consumed.
void BM_fastm_sincospi(benchmark::State& state) {
  const auto op = [](float v) {
    float sinOut, cosOut;
    dfm::sincospi(v, &sinOut, &cosOut);
    return sinOut + cosOut;
  };
  bench::runBench(state, sinInputs(), op);
}

// --- Batch benchmarks: explicit SIMD via SseFloat (4-wide SSE) ---

#if defined(__SSE4_1__)
#include <dispenso/fast_math/float_traits_x86.h>

// The batch benchmarks process the first kNumInputs entries of sinInputs() per
// benchmark iteration, writing into a stack buffer that is handed to
// benchmark::DoNotOptimize so the stores are kept. Items processed are reported
// as iterations * kNumInputs.

// libc sinf, one element at a time.
static void BM_batch_sinf(benchmark::State& state) {
  const std::vector<float>& table = sinInputs();
  const float* src = table.data();
  alignas(16) float outputs[kNumInputs];
  for (auto _ : state) {
    (void)_;
    for (size_t k = 0; k < kNumInputs; ++k) {
      outputs[k] = ::sinf(src[k]);
    }
    benchmark::DoNotOptimize(outputs);
  }
  state.SetItemsProcessed(state.iterations() * kNumInputs);
}

// Scalar dfm::sin, one element at a time.
static void BM_batch_sin_scalar(benchmark::State& state) {
  const std::vector<float>& table = sinInputs();
  const float* src = table.data();
  alignas(16) float outputs[kNumInputs];
  for (auto _ : state) {
    (void)_;
    for (size_t k = 0; k < kNumInputs; ++k) {
      outputs[k] = dfm::sin(src[k]);
    }
    benchmark::DoNotOptimize(outputs);
  }
  state.SetItemsProcessed(state.iterations() * kNumInputs);
}

// dfm::sin on SseFloat, four elements per call via unaligned loads/stores.
static void BM_batch_sin_sse(benchmark::State& state) {
  const std::vector<float>& table = sinInputs();
  const float* src = table.data();
  alignas(16) float outputs[kNumInputs];
  for (auto _ : state) {
    (void)_;
    for (size_t base = 0; base < kNumInputs; base += 4) {
      dfm::SseFloat packed = _mm_loadu_ps(src + base);
      dfm::SseFloat result = dfm::sin<dfm::SseFloat>(packed);
      _mm_storeu_ps(outputs + base, result.v);
    }
    benchmark::DoNotOptimize(outputs);
  }
  state.SetItemsProcessed(state.iterations() * kNumInputs);
}

// Scalar dfm::cos, one element at a time.
static void BM_batch_cos_scalar(benchmark::State& state) {
  const std::vector<float>& table = sinInputs();
  const float* src = table.data();
  alignas(16) float outputs[kNumInputs];
  for (auto _ : state) {
    (void)_;
    for (size_t k = 0; k < kNumInputs; ++k) {
      outputs[k] = dfm::cos(src[k]);
    }
    benchmark::DoNotOptimize(outputs);
  }
  state.SetItemsProcessed(state.iterations() * kNumInputs);
}

// dfm::cos on SseFloat, four elements per call via unaligned loads/stores.
static void BM_batch_cos_sse(benchmark::State& state) {
  const std::vector<float>& table = sinInputs();
  const float* src = table.data();
  alignas(16) float outputs[kNumInputs];
  for (auto _ : state) {
    (void)_;
    for (size_t base = 0; base < kNumInputs; base += 4) {
      dfm::SseFloat packed = _mm_loadu_ps(src + base);
      dfm::SseFloat result = dfm::cos<dfm::SseFloat>(packed);
      _mm_storeu_ps(outputs + base, result.v);
    }
    benchmark::DoNotOptimize(outputs);
  }
  state.SetItemsProcessed(state.iterations() * kNumInputs);
}

// Registered in this order: libc baseline, then scalar vs. SSE for sin and cos.
BENCHMARK(BM_batch_sinf);
BENCHMARK(BM_batch_sin_scalar);
BENCHMARK(BM_batch_sin_sse);
BENCHMARK(BM_batch_cos_scalar);
BENCHMARK(BM_batch_cos_sse);
#endif // __SSE4_1__

// --- Registrations ---

// Registration order is kept deliberately: each libc baseline (BM_<fn>) is
// immediately followed by its dispenso variants (BM_fastm_<fn>[_bounds|_accurate])
// so related rows sit next to each other in the report.

// Inverse trigonometric functions.
BENCHMARK(BM_acos);
BENCHMARK(BM_fastm_acos);
BENCHMARK(BM_asin);
BENCHMARK(BM_fastm_asin);
BENCHMARK(BM_atan);
BENCHMARK(BM_fastm_atan);
BENCHMARK(BM_atan2);
BENCHMARK(BM_fastm_atan2);
BENCHMARK(BM_fastm_atan2_bounds);

// Cube root.
BENCHMARK(BM_cbrt);
BENCHMARK(BM_fastm_cbrt);
BENCHMARK(BM_fastm_cbrt_accurate);

// Exponentials (base e, 10, 2).
BENCHMARK(BM_exp);
BENCHMARK(BM_fastm_exp);
BENCHMARK(BM_fastm_exp_bounds);
BENCHMARK(BM_fastm_exp_accurate);
BENCHMARK(BM_exp10);
BENCHMARK(BM_fastm_exp10);
BENCHMARK(BM_fastm_exp10_accurate);
BENCHMARK(BM_exp2);
BENCHMARK(BM_fastm_exp2);
BENCHMARK(BM_fastm_exp2_accurate);

// Logarithms (base e, 2, 10).
BENCHMARK(BM_log);
BENCHMARK(BM_fastm_log);
BENCHMARK(BM_fastm_log_accurate);
BENCHMARK(BM_log2);
BENCHMARK(BM_fastm_log2);
BENCHMARK(BM_fastm_log2_accurate);
BENCHMARK(BM_log10);
BENCHMARK(BM_fastm_log10);
BENCHMARK(BM_fastm_log10_accurate);

// Sine and cosine.
BENCHMARK(BM_sin);
BENCHMARK(BM_fastm_sin);
BENCHMARK(BM_fastm_sin_accurate);
BENCHMARK(BM_cos);
BENCHMARK(BM_fastm_cos);
BENCHMARK(BM_fastm_cos_accurate);

// Floating-point decomposition / scaling.
BENCHMARK(BM_frexp);
BENCHMARK(BM_fastm_frexp);
BENCHMARK(BM_ldexp);
BENCHMARK(BM_fastm_ldexp);

// Tangent.
BENCHMARK(BM_tan);
BENCHMARK(BM_fastm_tan);
BENCHMARK(BM_fastm_tan_accurate);

// Hypotenuse.
BENCHMARK(BM_hypot);
BENCHMARK(BM_fastm_hypot);
BENCHMARK(BM_naive_hypot);
BENCHMARK(BM_fastm_hypot_bounds);

// Combined sine/cosine and the pi-scaled variants.
BENCHMARK(BM_sin_plus_cos);
BENCHMARK(BM_fastm_sin_plus_cos);
BENCHMARK(BM_fastm_sincos);
BENCHMARK(BM_fastm_sinpi);
BENCHMARK(BM_fastm_cospi);
BENCHMARK(BM_fastm_sincospi);

// Power.
BENCHMARK(BM_pow);
BENCHMARK(BM_fastm_pow);
BENCHMARK(BM_fastm_pow_accurate);

// expm1, log1p and tanh.
BENCHMARK(BM_expm1);
BENCHMARK(BM_fastm_expm1);
BENCHMARK(BM_log1p);
BENCHMARK(BM_fastm_log1p);
BENCHMARK(BM_tanh);
BENCHMARK(BM_fastm_tanh);


================================================
FILE: benchmarks/fast_math/erf_benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Benchmark: erf S16 (float, t-substitution + inline exp) vs S21 (double, pure polynomial Estrin).
// Also benchmarks libc erff for reference.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

#include <benchmark/benchmark.h>
#include <dispenso/fast_math/fast_math.h>

#if defined(__SSE4_1__)
#include <immintrin.h>

#if defined(__GNUC__) || defined(__clang__)
#define UNUSED_VAR myLocalForLoopVar __attribute__((unused))
#elif defined(_MSC_VER)
#define UNUSED_VAR myLocalForLoopVar __pragma(warning(suppress : 4100))
#else
#define UNUSED_VAR myLocalForLoopVar
#endif

namespace dfm = dispenso::fast_math;

constexpr size_t kNumInputs = 4096;
constexpr size_t kInputsMask = 4095;

// --- Inputs: [-4, 4] ---

// Scalar erf inputs: exactly kNumInputs values starting at -4 and advancing by
// 8 / kNumInputs each. Built once on first call.
const std::vector<float>& erfScalarInputs() {
  static std::vector<float> inputs = []() {
    const float step = 8.0f / kNumInputs;
    std::vector<float> samples;
    float cur = -4.0f;
    for (size_t produced = 0; produced < kNumInputs; ++produced) {
      samples.push_back(cur);
      cur += step;
    }
    return samples;
  }();
  return inputs;
}

const std::vector<__m128>& erfSseInputs() {
  static std::vector<__m128> inputs = []() {
    float delta = 8.0f / kNumInputs;
    std::vector<__m128> inp;
    float f = -4.0f;
    for (size_t i = 0; i < kNumInputs; ++i) {
      inp.emplace_back(_mm_set_ps(f + 3 * delta, f + 2 * delta, f + delta, f));
      f += 4 * delta;
    }
    return inp;
  }();
  return inputs;
}

#if defined(__AVX2__)
const std::vector<__m256>& erfAvxInputs() {
  static std::vector<__m256> inputs = []() {
    float delta = 8.0f / kNumInputs;
    std::vector<__m256> inp;
    float f = -4.0f;
    for (size_t i = 0; i < kNumInputs; ++i) {
      inp.emplace_back(_mm256_set_ps(
          f + 7 * delta,
          f + 6 * delta,
          f + 5 * delta,
          f + 4 * delta,
          f + 3 * delta,
          f + 2 * delta,
          f + delta,
          f));
      f += 8 * delta;
    }
    return inp;
  }();
  return inputs;
}
#endif

// --- S16: float, t-substitution + inline exp, 2 ULP ---

// Single-precision erf approximation using three ranges of |x|:
//   |x| >= 3.92          -> result saturates to 1
//   0.875 <= |x| < 3.92  -> 1 - t*P(t) * exp(-x^2), with t = 1 / (1 + p*|x|)
//   |x| < 0.875          -> |x| * (c0 + u*Q(u)), with u = x^2
// The sign of x is reapplied at the end (the function is computed on |x|).
static inline float erf_s16(float x) {
  float ax = std::fabs(x);
  float result;
  if (ax >= 3.92f) {
    result = 1.0f;
  } else if (ax >= 0.875f) {
    // t-substitution: evaluate a degree-5 polynomial in t (Horner form), times t.
    constexpr float p = 0.45f;
    float t = 1.0f / std::fma(p, ax, 1.0f);

    constexpr float c0 = 0x1.04873ep-2f;
    constexpr float c1 = 0x1.f81fc6p-3f;
    constexpr float c2 = 0x1.189f42p-2f;
    constexpr float c3 = 0x1.15aaa6p-5f;
    constexpr float c4 = 0x1.65d24ep-2f;
    constexpr float c5 = -0x1.4432a4p-3f;

    float poly =
        t * std::fma(std::fma(std::fma(std::fma(std::fma(c5, t, c4), t, c3), t, c2), t, c1), t, c0);

    // Inline exp(-u) for u = x^2: split u = k*ln2 + f with k = floor(u * log2(e)),
    // so exp(-u) = 2^(-k) * exp(-f). ln2 is subtracted in two pieces (hi + lo).
    float u = ax * ax;
    constexpr float kLog2e = 0x1.715476p+0f;
    constexpr float kLn2hi = 0x1.62e400p-1f;
    constexpr float kLn2lo = 0x1.7f7d1cp-20f;
    float k = std::floor(u * kLog2e);
    float f = std::fma(k, -kLn2hi, u);
    f = std::fma(k, -kLn2lo, f);
    // Degree-5 polynomial in f (Horner form) for exp(-f).
    constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f;
    constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f;
    constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f;
    float exp_neg_f =
        std::fma(std::fma(std::fma(std::fma(std::fma(e5, f, e4), f, e3), f, e2), f, e1), f, e0);
    // Build 2^(-k) directly: (127 - k) placed in the exponent field of an
    // IEEE-754 single (assumes dfm::bit_cast reinterprets the integer bits).
    int32_t ki = static_cast<int32_t>(k);
    float pow2_neg_k = dfm::bit_cast<float>((127 - ki) << 23);

    result = 1.0f - pow2_neg_k * exp_neg_f * poly;
  } else {
    // Near zero: odd polynomial in |x|, written as |x| * (c0 + u * Q(u)).
    constexpr float c0 = 0x1.20dd76p+0f;
    constexpr float q0 = -0x1.812746p-2f, q1 = 0x1.ce2ec6p-4f, q2 = -0x1.b81edep-6f;
    constexpr float q3 = 0x1.556b48p-8f, q4 = -0x1.b0255p-11f, q5 = 0x1.7149c8p-14f;
    float u = ax * ax;
    float q =
        std::fma(std::fma(std::fma(std::fma(std::fma(q5, u, q4), u, q3), u, q2), u, q1), u, q0);
    result = ax * std::fma(q, u, c0);
  }
  return x < 0.0f ? -result : result;
}

// --- S21: double, pure polynomial Estrin, 1 ULP ---

// erf approximation evaluated in double precision and rounded to float.
// For |x| >= 3.92 the result saturates to +/-1; otherwise erf(|x|) is computed
// as |x| * R(u), u = x^2, where R is a degree-15 polynomial in u evaluated with
// Estrin's scheme. The sign of x is reapplied at the end.
static inline float erf_s21(float x) {
  const double mag = std::fabs(static_cast<double>(x));
  const bool negative = x < 0.0f;

  // Saturated tail.
  if (mag >= 3.92) {
    return static_cast<float>(negative ? -1.0 : 1.0);
  }

  // Polynomial coefficients, lowest order first.
  constexpr double k[16] = {
      0x1.20dd74ce6dac1p0,
      -0x1.812728fedb0c3p-2,
      0x1.ce2c679f0f94dp-4,
      -0x1.b81379b046993p-6,
      0x1.55decae500c6cp-8,
      -0x1.bd402ca3b1d09p-11,
      0x1.edfbbbd68d00ep-14,
      -0x1.d43f94bdfb90fp-17,
      0x1.77643daca82f5p-20,
      -0x1.f276d5cf346ecp-24,
      0x1.0a42c17eedcadp-27,
      -0x1.b999e591ae6bap-32,
      0x1.0f73303821975p-36,
      -0x1.ce969a50741b3p-42,
      0x1.e56af7f1b38e4p-48,
      -0x1.d6f65766c68e5p-55};

  // Powers of u needed by the three combining levels.
  const double u = mag * mag;
  const double u2 = u * u;
  const double u4 = u2 * u2;
  const double u8 = u4 * u4;

  // Level 1: eight linear terms k[2i] + k[2i+1] * u.
  const double a0 = std::fma(k[1], u, k[0]);
  const double a1 = std::fma(k[3], u, k[2]);
  const double a2 = std::fma(k[5], u, k[4]);
  const double a3 = std::fma(k[7], u, k[6]);
  const double a4 = std::fma(k[9], u, k[8]);
  const double a5 = std::fma(k[11], u, k[10]);
  const double a6 = std::fma(k[13], u, k[12]);
  const double a7 = std::fma(k[15], u, k[14]);

  // Level 2: pair them up with u^2.
  const double b0 = std::fma(a1, u2, a0);
  const double b1 = std::fma(a3, u2, a2);
  const double b2 = std::fma(a5, u2, a4);
  const double b3 = std::fma(a7, u2, a6);

  // Level 3: pair again with u^4.
  const double c0 = std::fma(b1, u4, b0);
  const double c1 = std::fma(b3, u4, b2);

  // Level 4: final combine with u^8.
  const double poly = std::fma(c1, u8, c0);

  const double value = mag * poly;
  return static_cast<float>(negative ? -value : value);
}

// --- SSE S16 (float4, t-substitution + inline exp) ---

// SSE candidate S16 (4 float lanes). Both branches are computed for every lane and
// the final value is picked per lane with compare masks:
//   |x| <  0.875        : |x| * (c0 + u * Q(u)), u = x^2
//   0.875 <= |x| < 3.92 : 1 - t * P(t) * exp(-x^2), t = 1 / (1 + p * |x|)
//   |x| >= 3.92         : 1
// The sign bit of x is OR-ed back in at the end.
static inline __m128 erf_s16_sse(__m128 x) {
  const __m128 absx = dfm::fabs(x);
  const __m128 signBit = _mm_and_ps(x, _mm_set1_ps(-0.0f));
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 xsq = _mm_mul_ps(absx, absx);

  // ---- Near-zero branch: Horner in u, highest coefficient first. ----
  constexpr float nc0 = 0x1.20dd76p+0f;
  constexpr float nq0 = -0x1.812746p-2f, nq1 = 0x1.ce2ec6p-4f, nq2 = -0x1.b81edep-6f;
  constexpr float nq3 = 0x1.556b48p-8f, nq4 = -0x1.b0255p-11f, nq5 = 0x1.7149c8p-14f;
  __m128 nearPoly = _mm_fmadd_ps(_mm_set1_ps(nq5), xsq, _mm_set1_ps(nq4));
  nearPoly = _mm_fmadd_ps(nearPoly, xsq, _mm_set1_ps(nq3));
  nearPoly = _mm_fmadd_ps(nearPoly, xsq, _mm_set1_ps(nq2));
  nearPoly = _mm_fmadd_ps(nearPoly, xsq, _mm_set1_ps(nq1));
  nearPoly = _mm_fmadd_ps(nearPoly, xsq, _mm_set1_ps(nq0));
  nearPoly = _mm_fmadd_ps(nearPoly, xsq, _mm_set1_ps(nc0));
  const __m128 nearZero = _mm_mul_ps(absx, nearPoly);

  // ---- Erfc branch: t-substitution followed by Horner in t. ----
  constexpr float p = 0.45f;
  const __m128 t = _mm_div_ps(one, _mm_fmadd_ps(_mm_set1_ps(p), absx, one));

  constexpr float pc0 = 0x1.04873ep-2f, pc1 = 0x1.f81fc6p-3f, pc2 = 0x1.189f42p-2f;
  constexpr float pc3 = 0x1.15aaa6p-5f, pc4 = 0x1.65d24ep-2f, pc5 = -0x1.4432a4p-3f;
  __m128 tailPoly = _mm_fmadd_ps(_mm_set1_ps(pc5), t, _mm_set1_ps(pc4));
  tailPoly = _mm_fmadd_ps(tailPoly, t, _mm_set1_ps(pc3));
  tailPoly = _mm_fmadd_ps(tailPoly, t, _mm_set1_ps(pc2));
  tailPoly = _mm_fmadd_ps(tailPoly, t, _mm_set1_ps(pc1));
  tailPoly = _mm_fmadd_ps(tailPoly, t, _mm_set1_ps(pc0));
  const __m128 tTimesP = _mm_mul_ps(t, tailPoly);

  // ---- Inline exp(-u) = 2^(-k) * exp(-f), k = floor(u * log2(e)). ----
  constexpr float kLog2e = 0x1.715476p+0f;
  constexpr float kLn2hi = 0x1.62e400p-1f;
  constexpr float kLn2lo = 0x1.7f7d1cp-20f;
  const __m128 k = _mm_floor_ps(_mm_mul_ps(xsq, _mm_set1_ps(kLog2e)));
  // f = u - k * ln2, with ln2 split into a high and a low part.
  __m128 frac = _mm_fmadd_ps(k, _mm_set1_ps(-kLn2hi), xsq);
  frac = _mm_fmadd_ps(k, _mm_set1_ps(-kLn2lo), frac);

  constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f;
  constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f;
  constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f;
  __m128 expNegFrac = _mm_fmadd_ps(_mm_set1_ps(e5), frac, _mm_set1_ps(e4));
  expNegFrac = _mm_fmadd_ps(expNegFrac, frac, _mm_set1_ps(e3));
  expNegFrac = _mm_fmadd_ps(expNegFrac, frac, _mm_set1_ps(e2));
  expNegFrac = _mm_fmadd_ps(expNegFrac, frac, _mm_set1_ps(e1));
  expNegFrac = _mm_fmadd_ps(expNegFrac, frac, _mm_set1_ps(e0));

  // Build 2^(-k) directly in the float exponent field: (127 - k) << 23.
  const __m128i kInt = _mm_cvtps_epi32(k);
  const __m128i biasedExp = _mm_sub_epi32(_mm_set1_epi32(127), kInt);
  const __m128 twoPowNegK = _mm_castsi128_ps(_mm_slli_epi32(biasedExp, 23));

  const __m128 expNegU = _mm_mul_ps(twoPowNegK, expNegFrac);
  const __m128 erfcBranch = _mm_sub_ps(one, _mm_mul_ps(expNegU, tTimesP));

  // ---- Per-lane selection. ----
  const __m128 pickErfc = _mm_cmpge_ps(absx, _mm_set1_ps(0.875f));
  const __m128 pickOne = _mm_cmpge_ps(absx, _mm_set1_ps(3.92f));
  __m128 magnitude = _mm_blendv_ps(nearZero, erfcBranch, pickErfc);
  magnitude = _mm_blendv_ps(magnitude, one, pickOne);

  return _mm_or_ps(magnitude, signBit);
}

// --- SSE S21 (double poly Estrin, process 4 floats via 2x double2) ---

// SSE candidate S21 (4 float lanes). The four inputs are widened to double two at a
// time, |x| * R(x^2) is evaluated with an Estrin tree on each pair, and the results
// are narrowed back to float. Lanes with |x| >= 3.92 are forced to 1 and the sign bit
// of x is OR-ed back in.
static inline __m128 erf_s21_sse(__m128 x) {
  const __m128 absx = dfm::fabs(x);
  const __m128 signBit = _mm_and_ps(x, _mm_set1_ps(-0.0f));

  // Evaluates a * R(a^2) for two double lanes.
  const auto oddPoly = [](__m128d a) -> __m128d {
    constexpr double c0 = 0x1.20dd74ce6dac1p0;
    constexpr double c1 = -0x1.812728fedb0c3p-2;
    constexpr double c2 = 0x1.ce2c679f0f94dp-4;
    constexpr double c3 = -0x1.b81379b046993p-6;
    constexpr double c4 = 0x1.55decae500c6cp-8;
    constexpr double c5 = -0x1.bd402ca3b1d09p-11;
    constexpr double c6 = 0x1.edfbbbd68d00ep-14;
    constexpr double c7 = -0x1.d43f94bdfb90fp-17;
    constexpr double c8 = 0x1.77643daca82f5p-20;
    constexpr double c9 = -0x1.f276d5cf346ecp-24;
    constexpr double c10 = 0x1.0a42c17eedcadp-27;
    constexpr double c11 = -0x1.b999e591ae6bap-32;
    constexpr double c12 = 0x1.0f73303821975p-36;
    constexpr double c13 = -0x1.ce969a50741b3p-42;
    constexpr double c14 = 0x1.e56af7f1b38e4p-48;
    constexpr double c15 = -0x1.d6f65766c68e5p-55;

    // Powers of the squared argument used by the tree levels.
    const __m128d s1 = _mm_mul_pd(a, a);
    const __m128d s2 = _mm_mul_pd(s1, s1);
    const __m128d s4 = _mm_mul_pd(s2, s2);
    const __m128d s8 = _mm_mul_pd(s4, s4);

    // Tree level 0: coefficient pairs.
    const __m128d pair0 = _mm_fmadd_pd(_mm_set1_pd(c1), s1, _mm_set1_pd(c0));
    const __m128d pair1 = _mm_fmadd_pd(_mm_set1_pd(c3), s1, _mm_set1_pd(c2));
    const __m128d pair2 = _mm_fmadd_pd(_mm_set1_pd(c5), s1, _mm_set1_pd(c4));
    const __m128d pair3 = _mm_fmadd_pd(_mm_set1_pd(c7), s1, _mm_set1_pd(c6));
    const __m128d pair4 = _mm_fmadd_pd(_mm_set1_pd(c9), s1, _mm_set1_pd(c8));
    const __m128d pair5 = _mm_fmadd_pd(_mm_set1_pd(c11), s1, _mm_set1_pd(c10));
    const __m128d pair6 = _mm_fmadd_pd(_mm_set1_pd(c13), s1, _mm_set1_pd(c12));
    const __m128d pair7 = _mm_fmadd_pd(_mm_set1_pd(c15), s1, _mm_set1_pd(c14));

    // Tree level 1: groups of four coefficients.
    const __m128d quad0 = _mm_fmadd_pd(pair1, s2, pair0);
    const __m128d quad1 = _mm_fmadd_pd(pair3, s2, pair2);
    const __m128d quad2 = _mm_fmadd_pd(pair5, s2, pair4);
    const __m128d quad3 = _mm_fmadd_pd(pair7, s2, pair6);

    // Tree level 2: groups of eight coefficients.
    const __m128d lowerHalf = _mm_fmadd_pd(quad1, s4, quad0);
    const __m128d upperHalf = _mm_fmadd_pd(quad3, s4, quad2);

    // Tree level 3: the full polynomial, then the leading factor a.
    const __m128d whole = _mm_fmadd_pd(upperHalf, s8, lowerHalf);
    return _mm_mul_pd(a, whole);
  };

  // Lanes 0-1 widen directly; lanes 2-3 are first moved into the low half.
  const __m128d lowPair = oddPoly(_mm_cvtps_pd(absx));
  const __m128d highPair = oddPoly(_mm_cvtps_pd(_mm_movehl_ps(absx, absx)));

  // Narrow each pair to float and stitch the two low halves together.
  const __m128 narrowed = _mm_movelh_ps(_mm_cvtpd_ps(lowPair), _mm_cvtpd_ps(highPair));

  // Saturate to 1 where |x| >= 3.92, then reapply the sign.
  const __m128 pickOne = _mm_cmpge_ps(absx, _mm_set1_ps(3.92f));
  const __m128 magnitude = _mm_blendv_ps(narrowed, _mm_set1_ps(1.0f), pickOne);
  return _mm_or_ps(magnitude, signBit);
}

// Writes the sum of the four lanes of `sum` to stdout so the accumulated benchmark
// result is actually used.
static void consumeSum(__m128 sum) {
  alignas(16) float lanes[4];
  _mm_store_ps(lanes, sum);
  const float total = lanes[0] + lanes[1] + lanes[2] + lanes[3];
  std::cout << total << std::endl;
}

// --- Scalar benchmarks ---

// Baseline: libc ::erff on one scalar input per benchmark iteration.
void BM_erf_libc(benchmark::State& state) {
  const auto& samples = erfScalarInputs();
  float total = 0.0f;
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    total += ::erff(samples[cursor]);
    // Advance and wrap the read position with kInputsMask.
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  // Print the accumulated value so the loop's work is observable.
  std::cout << total << std::endl;
}

// Scalar S16 candidate: erf_s16 on one input per benchmark iteration.
void BM_erf_s16_scalar(benchmark::State& state) {
  const auto& samples = erfScalarInputs();
  float total = 0.0f;
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    total += erf_s16(samples[cursor]);
    // Advance and wrap the read position with kInputsMask.
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  // Print the accumulated value so the loop's work is observable.
  std::cout << total << std::endl;
}

// Scalar S21 candidate: erf_s21 on one input per benchmark iteration.
void BM_erf_s21_scalar(benchmark::State& state) {
  const auto& samples = erfScalarInputs();
  float total = 0.0f;
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    total += erf_s21(samples[cursor]);
    // Advance and wrap the read position with kInputsMask.
    cursor = (cursor + 1) & kInputsMask;
  }
  state.SetItemsProcessed(state.iterations());
  // Print the accumulated value so the loop's work is observable.
  std::cout << total << std::endl;
}

// --- SSE benchmarks ---

// SSE S16 candidate: one __m128 (4 floats) per benchmark iteration.
void BM_erf_s16_sse(benchmark::State& state) {
  const auto& packets = erfSseInputs();
  __m128 laneTotals = _mm_setzero_ps();
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    const __m128 value = erf_s16_sse(packets[cursor]);
    laneTotals = _mm_add_ps(laneTotals, value);
    cursor = (cursor + 1) & kInputsMask;
  }
  // Four results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 4);
  consumeSum(laneTotals);
}

// SSE S21 candidate: one __m128 (4 floats) per benchmark iteration.
void BM_erf_s21_sse(benchmark::State& state) {
  const auto& packets = erfSseInputs();
  __m128 laneTotals = _mm_setzero_ps();
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    const __m128 value = erf_s21_sse(packets[cursor]);
    laneTotals = _mm_add_ps(laneTotals, value);
    cursor = (cursor + 1) & kInputsMask;
  }
  // Four results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 4);
  consumeSum(laneTotals);
}

// Register the libc baseline plus the scalar and SSE erf candidates.
BENCHMARK(BM_erf_libc);
BENCHMARK(BM_erf_s16_scalar);
BENCHMARK(BM_erf_s21_scalar);
BENCHMARK(BM_erf_s16_sse);
BENCHMARK(BM_erf_s21_sse);

#if defined(__AVX2__)

// Writes the sum of the eight lanes of `sum` to stdout so the accumulated benchmark
// result is actually used. Lanes are added in index order starting from zero.
static void consumeSum256(__m256 sum) {
  alignas(32) float lanes[8];
  _mm256_store_ps(lanes, sum);
  float total = 0;
  for (float lane : lanes) {
    total += lane;
  }
  std::cout << total << std::endl;
}

// --- AVX S16 ---

// AVX candidate S16 (8 float lanes). Both branches are computed for every lane and
// the final value is picked per lane with compare masks:
//   |x| <  0.875        : |x| * (c0 + u * Q(u)), u = x^2
//   0.875 <= |x| < 3.92 : 1 - t * P(t) * exp(-x^2), t = 1 / (1 + p * |x|)
//   |x| >= 3.92         : 1
// The sign bit of x is OR-ed back in at the end.
static inline __m256 erf_s16_avx(__m256 x) {
  const __m256 absx = dfm::fabs(x);
  const __m256 signBit = _mm256_and_ps(x, _mm256_set1_ps(-0.0f));
  const __m256 one = _mm256_set1_ps(1.0f);
  const __m256 xsq = _mm256_mul_ps(absx, absx);

  // ---- Near-zero branch: Horner in u, highest coefficient first. ----
  constexpr float nc0 = 0x1.20dd76p+0f;
  constexpr float nq0 = -0x1.812746p-2f, nq1 = 0x1.ce2ec6p-4f, nq2 = -0x1.b81edep-6f;
  constexpr float nq3 = 0x1.556b48p-8f, nq4 = -0x1.b0255p-11f, nq5 = 0x1.7149c8p-14f;
  __m256 nearPoly = _mm256_fmadd_ps(_mm256_set1_ps(nq5), xsq, _mm256_set1_ps(nq4));
  nearPoly = _mm256_fmadd_ps(nearPoly, xsq, _mm256_set1_ps(nq3));
  nearPoly = _mm256_fmadd_ps(nearPoly, xsq, _mm256_set1_ps(nq2));
  nearPoly = _mm256_fmadd_ps(nearPoly, xsq, _mm256_set1_ps(nq1));
  nearPoly = _mm256_fmadd_ps(nearPoly, xsq, _mm256_set1_ps(nq0));
  nearPoly = _mm256_fmadd_ps(nearPoly, xsq, _mm256_set1_ps(nc0));
  const __m256 nearZero = _mm256_mul_ps(absx, nearPoly);

  // ---- Erfc branch: t-substitution followed by Horner in t. ----
  constexpr float p = 0.45f;
  const __m256 t = _mm256_div_ps(one, _mm256_fmadd_ps(_mm256_set1_ps(p), absx, one));

  constexpr float pc0 = 0x1.04873ep-2f, pc1 = 0x1.f81fc6p-3f, pc2 = 0x1.189f42p-2f;
  constexpr float pc3 = 0x1.15aaa6p-5f, pc4 = 0x1.65d24ep-2f, pc5 = -0x1.4432a4p-3f;
  __m256 tailPoly = _mm256_fmadd_ps(_mm256_set1_ps(pc5), t, _mm256_set1_ps(pc4));
  tailPoly = _mm256_fmadd_ps(tailPoly, t, _mm256_set1_ps(pc3));
  tailPoly = _mm256_fmadd_ps(tailPoly, t, _mm256_set1_ps(pc2));
  tailPoly = _mm256_fmadd_ps(tailPoly, t, _mm256_set1_ps(pc1));
  tailPoly = _mm256_fmadd_ps(tailPoly, t, _mm256_set1_ps(pc0));
  const __m256 tTimesP = _mm256_mul_ps(t, tailPoly);

  // ---- Inline exp(-u) = 2^(-k) * exp(-f), k = floor(u * log2(e)). ----
  constexpr float kLog2e = 0x1.715476p+0f;
  constexpr float kLn2hi = 0x1.62e400p-1f;
  constexpr float kLn2lo = 0x1.7f7d1cp-20f;
  const __m256 k = _mm256_floor_ps(_mm256_mul_ps(xsq, _mm256_set1_ps(kLog2e)));
  // f = u - k * ln2, with ln2 split into a high and a low part.
  __m256 frac = _mm256_fmadd_ps(k, _mm256_set1_ps(-kLn2hi), xsq);
  frac = _mm256_fmadd_ps(k, _mm256_set1_ps(-kLn2lo), frac);

  constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f;
  constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f;
  constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f;
  __m256 expNegFrac = _mm256_fmadd_ps(_mm256_set1_ps(e5), frac, _mm256_set1_ps(e4));
  expNegFrac = _mm256_fmadd_ps(expNegFrac, frac, _mm256_set1_ps(e3));
  expNegFrac = _mm256_fmadd_ps(expNegFrac, frac, _mm256_set1_ps(e2));
  expNegFrac = _mm256_fmadd_ps(expNegFrac, frac, _mm256_set1_ps(e1));
  expNegFrac = _mm256_fmadd_ps(expNegFrac, frac, _mm256_set1_ps(e0));

  // Build 2^(-k) directly in the float exponent field: (127 - k) << 23.
  const __m256i kInt = _mm256_cvtps_epi32(k);
  const __m256i biasedExp = _mm256_sub_epi32(_mm256_set1_epi32(127), kInt);
  const __m256 twoPowNegK = _mm256_castsi256_ps(_mm256_slli_epi32(biasedExp, 23));

  const __m256 expNegU = _mm256_mul_ps(twoPowNegK, expNegFrac);
  const __m256 erfcBranch = _mm256_sub_ps(one, _mm256_mul_ps(expNegU, tTimesP));

  // ---- Per-lane selection. ----
  const __m256 pickErfc = _mm256_cmp_ps(absx, _mm256_set1_ps(0.875f), _CMP_GE_OQ);
  const __m256 pickOne = _mm256_cmp_ps(absx, _mm256_set1_ps(3.92f), _CMP_GE_OQ);
  __m256 magnitude = _mm256_blendv_ps(nearZero, erfcBranch, pickErfc);
  magnitude = _mm256_blendv_ps(magnitude, one, pickOne);

  return _mm256_or_ps(magnitude, signBit);
}

// --- AVX S21 (process 8 floats via 2x double4) ---

// AVX candidate S21 (8 float lanes). Each 128-bit half of the input (4 floats) is
// widened to one 256-bit vector of 4 doubles, |x| * R(x^2) is evaluated with an Estrin
// tree at full 256-bit width, and the two results are narrowed back to float and
// rejoined. Lanes with |x| >= 3.92 are forced to 1 and the sign bit of x is OR-ed in.
//
// The double-precision work uses __m256d so that this variant actually exercises the
// 256-bit units; every lane sees the same sequence of operations as the 128-bit
// formulation, so per-lane results are unchanged.
static inline __m256 erf_s21_avx(__m256 x) {
  __m256 ax_f = dfm::fabs(x);
  __m256 sign = _mm256_and_ps(x, _mm256_set1_ps(-0.0f));

  // Split 8 floats into 2 groups of 4 doubles.
  __m256d ax_lo = _mm256_cvtps_pd(_mm256_castps256_ps128(ax_f));
  __m256d ax_hi = _mm256_cvtps_pd(_mm256_extractf128_ps(ax_f, 1));

  // Evaluates ax * R(ax^2) for four double lanes.
  auto eval_poly = [](__m256d ax) -> __m256d {
    __m256d u = _mm256_mul_pd(ax, ax);
    constexpr double c0 = 0x1.20dd74ce6dac1p0;
    constexpr double c1 = -0x1.812728fedb0c3p-2;
    constexpr double c2 = 0x1.ce2c679f0f94dp-4;
    constexpr double c3 = -0x1.b81379b046993p-6;
    constexpr double c4 = 0x1.55decae500c6cp-8;
    constexpr double c5 = -0x1.bd402ca3b1d09p-11;
    constexpr double c6 = 0x1.edfbbbd68d00ep-14;
    constexpr double c7 = -0x1.d43f94bdfb90fp-17;
    constexpr double c8 = 0x1.77643daca82f5p-20;
    constexpr double c9 = -0x1.f276d5cf346ecp-24;
    constexpr double c10 = 0x1.0a42c17eedcadp-27;
    constexpr double c11 = -0x1.b999e591ae6bap-32;
    constexpr double c12 = 0x1.0f73303821975p-36;
    constexpr double c13 = -0x1.ce969a50741b3p-42;
    constexpr double c14 = 0x1.e56af7f1b38e4p-48;
    constexpr double c15 = -0x1.d6f65766c68e5p-55;

    // Estrin level 0
    __m256d p0 = _mm256_fmadd_pd(_mm256_set1_pd(c1), u, _mm256_set1_pd(c0));
    __m256d p1 = _mm256_fmadd_pd(_mm256_set1_pd(c3), u, _mm256_set1_pd(c2));
    __m256d p2 = _mm256_fmadd_pd(_mm256_set1_pd(c5), u, _mm256_set1_pd(c4));
    __m256d p3 = _mm256_fmadd_pd(_mm256_set1_pd(c7), u, _mm256_set1_pd(c6));
    __m256d p4 = _mm256_fmadd_pd(_mm256_set1_pd(c9), u, _mm256_set1_pd(c8));
    __m256d p5 = _mm256_fmadd_pd(_mm256_set1_pd(c11), u, _mm256_set1_pd(c10));
    __m256d p6 = _mm256_fmadd_pd(_mm256_set1_pd(c13), u, _mm256_set1_pd(c12));
    __m256d p7 = _mm256_fmadd_pd(_mm256_set1_pd(c15), u, _mm256_set1_pd(c14));
    __m256d u2 = _mm256_mul_pd(u, u);

    // Level 1
    __m256d q0 = _mm256_fmadd_pd(p1, u2, p0);
    __m256d q1 = _mm256_fmadd_pd(p3, u2, p2);
    __m256d q2 = _mm256_fmadd_pd(p5, u2, p4);
    __m256d q3 = _mm256_fmadd_pd(p7, u2, p6);
    __m256d u4 = _mm256_mul_pd(u2, u2);

    // Level 2
    __m256d r0 = _mm256_fmadd_pd(q1, u4, q0);
    __m256d r1 = _mm256_fmadd_pd(q3, u4, q2);
    __m256d u8 = _mm256_mul_pd(u4, u4);

    // Level 3
    __m256d R = _mm256_fmadd_pd(r1, u8, r0);
    return _mm256_mul_pd(ax, R);
  };

  __m256d res_lo = eval_poly(ax_lo);
  __m256d res_hi = eval_poly(ax_hi);

  // Convert back: 2x double4 -> 2x float4 -> 1x float8
  __m128 lo_f = _mm256_cvtpd_ps(res_lo);
  __m128 hi_f = _mm256_cvtpd_ps(res_hi);
  __m256 result = _mm256_set_m128(hi_f, lo_f);

  // Clamp to 1 for |x| >= 3.92
  __m256 saturated = _mm256_cmp_ps(ax_f, _mm256_set1_ps(3.92f), _CMP_GE_OQ);
  result = _mm256_blendv_ps(result, _mm256_set1_ps(1.0f), saturated);

  // Restore sign
  return _mm256_or_ps(result, sign);
}

// AVX S16 candidate: one __m256 (8 floats) per benchmark iteration.
void BM_erf_s16_avx(benchmark::State& state) {
  const auto& packets = erfAvxInputs();
  __m256 laneTotals = _mm256_setzero_ps();
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    const __m256 value = erf_s16_avx(packets[cursor]);
    laneTotals = _mm256_add_ps(laneTotals, value);
    cursor = (cursor + 1) & kInputsMask;
  }
  // Eight results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 8);
  consumeSum256(laneTotals);
}

// AVX S21 candidate: one __m256 (8 floats) per benchmark iteration.
void BM_erf_s21_avx(benchmark::State& state) {
  const auto& packets = erfAvxInputs();
  __m256 laneTotals = _mm256_setzero_ps();
  size_t cursor = 0;
  for (auto UNUSED_VAR : state) {
    const __m256 value = erf_s21_avx(packets[cursor]);
    laneTotals = _mm256_add_ps(laneTotals, value);
    cursor = (cursor + 1) & kInputsMask;
  }
  // Eight results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 8);
  consumeSum256(laneTotals);
}

// Register the AVX erf candidates (only compiled when __AVX2__ is defined).
BENCHMARK(BM_erf_s16_avx);
BENCHMARK(BM_erf_s21_avx);

#endif // __AVX2__

#else // !__SSE4_1__

// Fallback entry point for builds without SSE4.1: prints a notice and exits
// successfully without running any benchmark.
int main() {
  std::cout << "SSE4.1 not available, skipping benchmarks." << std::endl;
  return 0;
}

#endif // __SSE4_1__


================================================
FILE: benchmarks/fast_math/hwy_benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "benchmark_helpers.h"

#if __has_include("hwy/highway.h")
#include "hwy/contrib/math/math-inl.h"

namespace dfm = dispenso::fast_math;
namespace bench = dispenso::fast_math::bench;
namespace hn = hwy::HWY_NAMESPACE;
using Flt = dfm::HwyFloat;
using HwyFloatTag = dfm::HwyFloatTag;

// Traits type passed as the second template argument to dfm functions in the
// "*_bounds" benchmarks below: max-accuracy off, bounds-value handling on.
struct BoundsTraits {
  static constexpr bool kMaxAccuracy = false;
  static constexpr bool kBoundsValues = true;
};

// --- One-arg benchmarks ---

// Trigonometric benchmarks. Each passes an input set and a lambda calling the dfm
// function to bench::runBench. The "_accurate" variants instantiate the function with
// dfm::MaxAccuracyTraits.

// sin over bench::sinInputs.
void BM_sin_hwy(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::sin(x); });
}
void BM_sin_hwy_accurate(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) {
    return dfm::sin<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
// cos over bench::sinInputs.
void BM_cos_hwy(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::cos(x); });
}
void BM_cos_hwy_accurate(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) {
    return dfm::cos<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
// tan and atan reuse bench::sinInputs.
void BM_tan_hwy(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::tan(x); });
}
void BM_atan_hwy(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::atan(x); });
}
// acos and asin share bench::acosInputs.
void BM_acos_hwy(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::acos(x); });
}
void BM_asin_hwy(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::asin(x); });
}

// Exponential-family benchmarks over bench::expInputs (expm1 uses bench::sinInputs).
// exp is measured with default traits, dfm::MaxAccuracyTraits, and the local
// BoundsTraits.
void BM_exp_hwy(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp(x); });
}
void BM_exp_hwy_accurate(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) {
    return dfm::exp<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_exp_hwy_bounds(benchmark::State& state) {
  bench::runBench(
      state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp<Flt, BoundsTraits>(x); });
}
void BM_exp2_hwy(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp2(x); });
}
void BM_exp10_hwy(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp10(x); });
}
// expm1 is fed bench::sinInputs rather than bench::expInputs.
void BM_expm1_hwy(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::expm1(x); });
}

// Logarithm-family benchmarks over bench::logInputs (log1p uses bench::sinInputs).
void BM_log_hwy(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log(x); });
}
void BM_log_hwy_accurate(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    return dfm::log<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_log2_hwy(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log2(x); });
}
void BM_log10_hwy(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log10(x); });
}
// log1p is applied to |x| of bench::sinInputs; the absolute value is taken on the raw
// Highway vector (x.v). Presumably this keeps the argument non-negative -- TODO confirm.
void BM_log1p_hwy(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::log1p(hn::Abs(x.v)); });
}

// cbrt over bench::logInputs, with default traits and with dfm::MaxAccuracyTraits.
void BM_cbrt_hwy(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::cbrt(x); });
}
void BM_cbrt_hwy_accurate(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    return dfm::cbrt<Flt, dfm::MaxAccuracyTraits>(x);
  });
}

// frexp over bench::logInputs; the exponent output is written to a local that is
// discarded, only the returned value is handed back to the harness.
void BM_frexp_hwy(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    dfm::IntType_t<Flt> e;
    return dfm::frexp(x, &e);
  });
}
// ldexp over bench::logInputs with a constant exponent of 3 in every lane.
void BM_ldexp_hwy(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    const hn::RebindToSigned<HwyFloatTag> di;
    return dfm::ldexp(x, hn::Set(di, 3));
  });
}

// tanh and erf each use their own dedicated input set.
void BM_tanh_hwy(benchmark::State& state) {
  bench::runBench(state, bench::tanhInputs<Flt>(), [](auto x) { return dfm::tanh(x); });
}
void BM_erf_hwy(benchmark::State& state) {
  bench::runBench(state, bench::erfInputs<Flt>(), [](auto x) { return dfm::erf(x); });
}

// --- Two-arg benchmarks ---

// atan2(y, x): y comes from bench::expInputs, x from bench::sinInputs.
void BM_atan2_hwy(benchmark::State& state) {
  bench::runBench2(state, bench::expInputs<Flt>(), bench::sinInputs<Flt>(), [](auto y, auto x) {
    return dfm::atan2(y, x);
  });
}

// hypot(x, y): x comes from bench::hypotInputs, y from bench::sinInputs.
void BM_hypot_hwy(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot(x, y);
  });
}
// NOTE(review): despite the "_bounds" name this instantiates hypot with
// dfm::MaxAccuracyTraits, not the BoundsTraits struct defined in this file -- confirm
// that is the intended configuration.
void BM_hypot_hwy_bounds(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot<Flt, dfm::MaxAccuracyTraits>(x, y);
  });
}

// --- Highway contrib/math comparison benchmarks ---

// Comparison benchmarks: the same input sets as the dfm benchmarks above, but the
// lambda calls the Highway contrib math function on the raw vector (x.v) with a
// default-constructed HwyFloatTag.
void BM_sin_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::sinInputs<Flt>(), [](auto x) { return hn::Sin(HwyFloatTag(), x.v); });
}
void BM_cos_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::sinInputs<Flt>(), [](auto x) { return hn::Cos(HwyFloatTag(), x.v); });
}
void BM_exp_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::expInputs<Flt>(), [](auto x) { return hn::Exp(HwyFloatTag(), x.v); });
}
void BM_exp2_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::expInputs<Flt>(), [](auto x) { return hn::Exp2(HwyFloatTag(), x.v); });
}
void BM_log_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::logInputs<Flt>(), [](auto x) { return hn::Log(HwyFloatTag(), x.v); });
}
void BM_log2_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::logInputs<Flt>(), [](auto x) { return hn::Log2(HwyFloatTag(), x.v); });
}
void BM_log10_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::logInputs<Flt>(), [](auto x) { return hn::Log10(HwyFloatTag(), x.v); });
}
void BM_atan_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::sinInputs<Flt>(), [](auto x) { return hn::Atan(HwyFloatTag(), x.v); });
}
void BM_acos_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::acosInputs<Flt>(), [](auto x) { return hn::Acos(HwyFloatTag(), x.v); });
}
void BM_asin_hwy_contrib(benchmark::State& state) {
  bench::runBench(
      state, bench::acosInputs<Flt>(), [](auto x) { return hn::Asin(HwyFloatTag(), x.v); });
}
// Two-argument comparison: y from bench::expInputs, x from bench::sinInputs.
void BM_atan2_hwy_contrib(benchmark::State& state) {
  bench::runBench2(state, bench::expInputs<Flt>(), bench::sinInputs<Flt>(), [](auto y, auto x) {
    return hn::Atan2(HwyFloatTag(), y.v, x.v);
  });
}

// --- Registrations ---

// Registration order places each dfm benchmark next to its "_accurate"/"_bounds"
// variants and, where one exists, its Highway contrib counterpart.
BENCHMARK(BM_sin_hwy);
BENCHMARK(BM_sin_hwy_accurate);
BENCHMARK(BM_sin_hwy_contrib);
BENCHMARK(BM_cos_hwy);
BENCHMARK(BM_cos_hwy_accurate);
BENCHMARK(BM_cos_hwy_contrib);
BENCHMARK(BM_tan_hwy);
BENCHMARK(BM_atan_hwy);
BENCHMARK(BM_atan_hwy_contrib);
BENCHMARK(BM_acos_hwy);
BENCHMARK(BM_acos_hwy_contrib);
BENCHMARK(BM_asin_hwy);
BENCHMARK(BM_asin_hwy_contrib);
BENCHMARK(BM_exp_hwy);
BENCHMARK(BM_exp_hwy_accurate);
BENCHMARK(BM_exp_hwy_bounds);
BENCHMARK(BM_exp_hwy_contrib);
BENCHMARK(BM_exp2_hwy);
BENCHMARK(BM_exp2_hwy_contrib);
BENCHMARK(BM_exp10_hwy);
BENCHMARK(BM_expm1_hwy);
BENCHMARK(BM_log_hwy);
BENCHMARK(BM_log_hwy_accurate);
BENCHMARK(BM_log_hwy_contrib);
BENCHMARK(BM_log2_hwy);
BENCHMARK(BM_log2_hwy_contrib);
BENCHMARK(BM_log10_hwy);
BENCHMARK(BM_log10_hwy_contrib);
BENCHMARK(BM_log1p_hwy);
BENCHMARK(BM_cbrt_hwy);
BENCHMARK(BM_cbrt_hwy_accurate);
BENCHMARK(BM_frexp_hwy);
BENCHMARK(BM_ldexp_hwy);
BENCHMARK(BM_tanh_hwy);
BENCHMARK(BM_erf_hwy);
BENCHMARK(BM_atan2_hwy);
BENCHMARK(BM_atan2_hwy_contrib);
BENCHMARK(BM_hypot_hwy);
BENCHMARK(BM_hypot_hwy_bounds);

#else // !__has_include("hwy/highway.h")

// Fallback entry point when hwy/highway.h cannot be included: prints a notice and
// exits successfully without running any benchmark.
int main() {
  std::cout << "Highway not available, skipping benchmarks." << std::endl;
  return 0;
}

#endif // __has_include("hwy/highway.h")


================================================
FILE: benchmarks/fast_math/neon_benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "benchmark_helpers.h"

#if defined(__aarch64__)

namespace dfm = dispenso::fast_math;
namespace bench = dispenso::fast_math::bench;
using Flt = float32x4_t;

// --- One-arg benchmarks ---

// Trigonometric benchmarks on float32x4_t. Each passes an input set and a lambda
// calling the dfm function to bench::runBench.

// sin, cos, tan and atan all draw from bench::sinInputs.
void BM_sin_neon(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::sin(x); });
}
void BM_cos_neon(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::cos(x); });
}
void BM_tan_neon(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::tan(x); });
}
void BM_atan_neon(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::atan(x); });
}
// acos and asin share bench::acosInputs.
void BM_acos_neon(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::acos(x); });
}
void BM_asin_neon(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::asin(x); });
}

// Exponential-family benchmarks over bench::expInputs (expm1 uses bench::sinInputs).
void BM_exp_neon(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp(x); });
}
void BM_exp2_neon(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp2(x); });
}
void BM_exp10_neon(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp10(x); });
}
// expm1 is fed bench::sinInputs rather than bench::expInputs.
void BM_expm1_neon(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::expm1(x); });
}

// Logarithm-family benchmarks over bench::logInputs (log1p uses bench::sinInputs).
void BM_log_neon(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log(x); });
}
void BM_log2_neon(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log2(x); });
}
void BM_log10_neon(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log10(x); });
}
// log1p is applied to |x| (vabsq_f32) of bench::sinInputs. Presumably this keeps the
// argument non-negative -- TODO confirm.
void BM_log1p_neon(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](Flt x) { return dfm::log1p(vabsq_f32(x)); });
}

// cbrt over bench::logInputs.
void BM_cbrt_neon(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::cbrt(x); });
}

// frexp over bench::logInputs; the exponent output is written to a local that is
// discarded, only the returned value is handed back to the harness.
void BM_frexp_neon(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) {
    dfm::IntType_t<Flt> e;
    return dfm::frexp(x, &e);
  });
}
// ldexp over bench::logInputs with a constant exponent of 3 in every lane.
void BM_ldexp_neon(benchmark::State& state) {
  bench::runBench(
      state, bench::logInputs<Flt>(), [](auto x) { return dfm::ldexp(x, vdupq_n_s32(3)); });
}

// tanh and erf each use their own dedicated input set.
void BM_tanh_neon(benchmark::State& state) {
  bench::runBench(state, bench::tanhInputs<Flt>(), [](auto x) { return dfm::tanh(x); });
}
void BM_erf_neon(benchmark::State& state) {
  bench::runBench(state, bench::erfInputs<Flt>(), [](auto x) { return dfm::erf(x); });
}

// --- Two-arg benchmarks ---

// atan2(y, x): y comes from bench::expInputs, x from bench::sinInputs.
void BM_atan2_neon(benchmark::State& state) {
  bench::runBench2(state, bench::expInputs<Flt>(), bench::sinInputs<Flt>(), [](auto y, auto x) {
    return dfm::atan2(y, x);
  });
}

// hypot(x, y): x comes from bench::hypotInputs, y from bench::sinInputs.
void BM_hypot_neon(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot(x, y);
  });
}
// NOTE(review): the "_bounds" variant instantiates hypot with dfm::MaxAccuracyTraits;
// confirm that this is the intended traits type for a bounds benchmark.
void BM_hypot_neon_bounds(benchmark::State& state) {
  bench::runBench2(state, bench::hypotInputs<Flt>(), bench::sinInputs<Flt>(), [](auto x, auto y) {
    return dfm::hypot<Flt, dfm::MaxAccuracyTraits>(x, y);
  });
}

// pow(b, e) with vector base and vector exponent: bases from bench::powBaseInputs,
// exponents from bench::powExpInputs.
void BM_pow_neon(benchmark::State& state) {
  bench::runBench2(
      state, bench::powBaseInputs<Flt>(), bench::powExpInputs<Flt>(), [](auto b, auto e) {
        return dfm::pow(b, e);
      });
}
// Same inputs, instantiated with dfm::MaxAccuracyTraits.
void BM_pow_neon_accurate(benchmark::State& state) {
  bench::runBench2(
      state, bench::powBaseInputs<Flt>(), bench::powExpInputs<Flt>(), [](auto b, auto e) {
        return dfm::pow<Flt, dfm::MaxAccuracyTraits>(b, e);
      });
}
// Vector base with a fixed scalar exponent of 2.5f.
void BM_pow_neon_scalar_exp(benchmark::State& state) {
  bench::runBench(state, bench::powBaseInputs<Flt>(), [](auto x) { return dfm::pow(x, 2.5f); });
}

// --- Libc-packed comparison (NEON-specific, kept hand-written) ---

// libc baseline for pow on NEON-packed inputs: each iteration spills one base vector
// and one exponent vector to memory, calls ::powf per lane, and reloads the four
// results as a vector for accumulation.
void BM_pow_libc_neon(benchmark::State& state) {
  const auto& baseVecs = bench::powBaseInputs<Flt>();
  const auto& expVecs = bench::powExpInputs<Flt>();
  Flt laneTotals = vdupq_n_f32(0.0f);
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    alignas(16) float baseLanes[4];
    alignas(16) float expLanes[4];
    alignas(16) float powLanes[4];
    vst1q_f32(baseLanes, baseVecs[cursor]);
    vst1q_f32(expLanes, expVecs[cursor]);
    for (int32_t lane = 0; lane < 4; ++lane) {
      powLanes[lane] = ::powf(baseLanes[lane], expLanes[lane]);
    }
    laneTotals = vaddq_f32(laneTotals, vld1q_f32(powLanes));
    cursor = (cursor + 1) & bench::kInputsMask;
  }
  // Four results are produced per iteration.
  state.SetItemsProcessed(state.iterations() * 4);
  bench::consumeResult(laneTotals);
}

// --- Registrations ---

// Registered in the same order the benchmarks are defined above; the libc pow
// baseline comes last, after the dfm pow variants.
BENCHMARK(BM_sin_neon);
BENCHMARK(BM_cos_neon);
BENCHMARK(BM_tan_neon);
BENCHMARK(BM_atan_neon);
BENCHMARK(BM_acos_neon);
BENCHMARK(BM_asin_neon);
BENCHMARK(BM_exp_neon);
BENCHMARK(BM_exp2_neon);
BENCHMARK(BM_exp10_neon);
BENCHMARK(BM_expm1_neon);
BENCHMARK(BM_log_neon);
BENCHMARK(BM_log2_neon);
BENCHMARK(BM_log10_neon);
BENCHMARK(BM_log1p_neon);
BENCHMARK(BM_cbrt_neon);
BENCHMARK(BM_frexp_neon);
BENCHMARK(BM_ldexp_neon);
BENCHMARK(BM_tanh_neon);
BENCHMARK(BM_erf_neon);
BENCHMARK(BM_atan2_neon);
BENCHMARK(BM_hypot_neon);
BENCHMARK(BM_hypot_neon_bounds);
BENCHMARK(BM_pow_neon);
BENCHMARK(BM_pow_neon_accurate);
BENCHMARK(BM_pow_neon_scalar_exp);
BENCHMARK(BM_pow_libc_neon);

#else // !defined(__aarch64__)

// Fallback entry point for non-AArch64 builds: prints a notice and exits successfully
// without running any benchmark.
int main() {
  std::cout << "AArch64 NEON not available, skipping benchmarks." << std::endl;
  return 0;
}

#endif // defined(__aarch64__)


================================================
FILE: benchmarks/fast_math/sse_benchmarks.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "benchmark_helpers.h"

#if defined(__SSE4_1__)

namespace dfm = dispenso::fast_math;
namespace bench = dispenso::fast_math::bench;
using Flt = __m128;

// Traits type passed as the second template argument to dfm functions in the
// "*_bounds" benchmarks below: max-accuracy off, bounds-value handling on.
struct BoundsTraits {
  static constexpr bool kMaxAccuracy = false;
  static constexpr bool kBoundsValues = true;
};

// --- One-arg benchmarks ---

// Trigonometric benchmarks on __m128. Each passes an input set and a lambda calling
// the dfm function to bench::runBench. The "_accurate" variants instantiate the
// function with dfm::MaxAccuracyTraits.

// sin over bench::sinInputs.
void BM_sin_sse(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::sin(x); });
}
void BM_sin_sse_accurate(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) {
    return dfm::sin<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
// cos over bench::sinInputs.
void BM_cos_sse(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::cos(x); });
}
void BM_cos_sse_accurate(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) {
    return dfm::cos<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
// tan and atan reuse bench::sinInputs.
void BM_tan_sse(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::tan(x); });
}
void BM_atan_sse(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::atan(x); });
}
// acos and asin share bench::acosInputs.
void BM_acos_sse(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::acos(x); });
}
void BM_asin_sse(benchmark::State& state) {
  bench::runBench(state, bench::acosInputs<Flt>(), [](auto x) { return dfm::asin(x); });
}

// Exponential-family benchmarks over bench::expInputs (expm1 uses bench::sinInputs).
// exp is measured with default traits, dfm::MaxAccuracyTraits, and the local
// BoundsTraits.
void BM_exp_sse(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp(x); });
}
void BM_exp_sse_accurate(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) {
    return dfm::exp<Flt, dfm::MaxAccuracyTraits>(x);
  });
}
void BM_exp_sse_bounds(benchmark::State& state) {
  bench::runBench(
      state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp<Flt, BoundsTraits>(x); });
}
void BM_exp2_sse(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp2(x); });
}
void BM_exp10_sse(benchmark::State& state) {
  bench::runBench(state, bench::expInputs<Flt>(), [](auto x) { return dfm::exp10(x); });
}
// expm1 is fed bench::sinInputs rather than bench::expInputs.
void BM_expm1_sse(benchmark::State& state) {
  bench::runBench(state, bench::sinInputs<Flt>(), [](auto x) { return dfm::expm1(x); });
}

void BM_log_sse(benchmark::State& state) {
  bench::runBench(state, bench::logInputs<Flt>(), [](auto x) { return dfm::log(x); });
}
// Benchmarks dfm::log instantiated with dfm::MaxAccuracyTraits over bench::logInputs<Flt>().
void BM_log_sse_accurate(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  auto op = [](auto x) { return dfm::log<Flt, dfm::MaxAccuracyTraits>(x); };
  bench::runBench(state, inputs, op);
}
// Benchmarks dfm::log2 (no explicit traits) over bench::logInputs<Flt>().
void BM_log2_sse(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) { return dfm::log2(x); });
}
// Benchmarks dfm::log10 (no explicit traits) over bench::logInputs<Flt>().
void BM_log10_sse(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) { return dfm::log10(x); });
}
// Benchmarks dfm::log1p on the magnitudes of the sine input set.
void BM_log1p_sse(benchmark::State& state) {
  const auto& inputs = bench::sinInputs<Flt>();
  bench::runBench(state, inputs, [](Flt x) {
    // andnot with the -0.0f mask clears each lane's sign bit, i.e. takes |x|.
    const Flt signMask = _mm_set1_ps(-0.0f);
    Flt magnitude = _mm_andnot_ps(signMask, x);
    return dfm::log1p(magnitude);
  });
}

// Benchmarks dfm::cbrt (no explicit traits); it reuses the log input set.
void BM_cbrt_sse(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) { return dfm::cbrt(x); });
}
// Benchmarks dfm::cbrt instantiated with dfm::MaxAccuracyTraits over bench::logInputs<Flt>().
void BM_cbrt_sse_accurate(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  auto op = [](auto x) { return dfm::cbrt<Flt, dfm::MaxAccuracyTraits>(x); };
  bench::runBench(state, inputs, op);
}

// Benchmarks dfm::frexp over bench::logInputs<Flt>(). Only the returned value is handed back to
// the harness; the exponent output is written to a local and dropped.
void BM_frexp_sse(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) {
    dfm::IntType_t<Flt> exponent;
    return dfm::frexp(x, &exponent);
  });
}
// Benchmarks dfm::ldexp over bench::logInputs<Flt>() with a constant exponent of 3 in every lane.
void BM_ldexp_sse(benchmark::State& state) {
  const auto& inputs = bench::logInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) { return dfm::ldexp(x, _mm_set1_epi32(3)); });
}

// Benchmarks dfm::tanh (no explicit traits) over bench::tanhInputs<Flt>().
void BM_tanh_sse(benchmark::State& state) {
  const auto& inputs = bench::tanhInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) { return dfm::tanh(x); });
}
// Benchmarks dfm::erf (no explicit traits) over bench::erfInputs<Flt>().
void BM_erf_sse(benchmark::State& state) {
  const auto& inputs = bench::erfInputs<Flt>();
  bench::runBench(state, inputs, [](auto x) { return dfm::erf(x); });
}

// --- Two-arg benchmarks ---

// Benchmarks dfm::atan2(y, x): y is drawn from the exp input set, x from the sine input set.
void BM_atan2_sse(benchmark::State& state) {
  const auto& yInputs = bench::expInputs<Flt>();
  const auto& xInputs = bench::sinInputs<Flt>();
  bench::runBench2(state, yInputs, xInputs, [](auto y, auto x) { return dfm::atan2(y, x); });
}

// Benchmarks dfm::hypot (no explicit traits): first operand from bench::hypotInputs<Flt>(),
// second from bench::sinInputs<Flt>().
void BM_hypot_sse(benchmark::State& state) {
  const auto& xInputs = bench::hypotInputs<Flt>();
  const auto& yInputs = bench::sinInputs<Flt>();
  bench::runBench2(state, xInputs, yInputs, [](auto x, auto y) { return dfm::hypot(x, y); });
}
// Benchmarks dfm::hypot instantiated with dfm::MaxAccuracyTraits, using the same input pairing
// as BM_hypot_sse.
// NOTE(review): the name says "bounds" but the traits are MaxAccuracyTraits, whereas
// BM_exp_sse_bounds uses BoundsTraits -- confirm this is intentional.
void BM_hypot_sse_bounds(benchmark::State& state) {
  const auto& xInputs = bench::hypotInputs<Flt>();
  const auto& yInputs = bench::sinInputs<Flt>();
  auto op = [](auto x, auto y) { return dfm::hypot<Flt, dfm::MaxAccuracyTraits>(x, y); };
  bench::runBench2(state, xInputs, yInputs, op);
}

// Benchmarks dfm::pow (no explicit traits) with per-lane bases and exponents taken from
// bench::powBaseInputs<Flt>() and bench::powExpInputs<Flt>().
void BM_pow_sse(benchmark::State& state) {
  const auto& bases = bench::powBaseInputs<Flt>();
  const auto& exponents = bench::powExpInputs<Flt>();
  bench::runBench2(state, bases, exponents, [](auto b, auto e) { return dfm::pow(b, e); });
}
// Benchmarks dfm::pow instantiated with dfm::MaxAccuracyTraits, using the same base/exponent
// input sets as BM_pow_sse.
void BM_pow_sse_accurate(benchmark::State& state) {
  const auto& bases = bench::powBaseInputs<Flt>();
  const auto& exponents = bench::powExpInputs<Flt>();
  auto op = [](auto b, auto e) { return dfm::pow<Flt, dfm::MaxAccuracyTraits>(b, e); };
  bench::runBench2(state, bases, exponents, op);
}
// Benchmarks dfm::pow with a vector base and the scalar float exponent 2.5f.
void BM_pow_sse_scalar_exp(benchmark::State& state) {
  const auto& bases = bench::powBaseInputs<Flt>();
  bench::runBench(state, bases, [](auto x) { return dfm::pow(x, 2.5f); });
}

// --- Libc-packed comparisons (SSE-specific, kept hand-written) ---

// libc baseline for hypot: each iteration spills one vector from each input set to float
// arrays, calls ::hypotf on every lane, reloads the four results and adds them to a running sum
// that is finally handed to bench::consumeResult.
void BM_hypot_libc(benchmark::State& state) {
  const auto& lhsInputs = bench::hypotInputs<Flt>();
  const auto& rhsInputs = bench::sinInputs<Flt>();
  Flt acc = _mm_setzero_ps();
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    // 16-byte alignment is required by the aligned store/load intrinsics used below.
    alignas(16) float a[4];
    alignas(16) float b[4];
    alignas(16) float out[4];
    _mm_store_ps(a, lhsInputs[cursor]);
    _mm_store_ps(b, rhsInputs[cursor]);
    out[0] = ::hypotf(a[0], b[0]);
    out[1] = ::hypotf(a[1], b[1]);
    out[2] = ::hypotf(a[2], b[2]);
    out[3] = ::hypotf(a[3], b[3]);
    acc = _mm_add_ps(acc, _mm_load_ps(out));
    // Step to the next input vector, wrapping via the mask.
    cursor = (cursor + 1) & bench::kInputsMask;
  }
  // Four scalar evaluations happen per benchmark iteration.
  state.SetItemsProcessed(state.iterations() * 4);
  bench::consumeResult(acc);
}

// libc baseline for pow: each iteration spills one base vector and one exponent vector to float
// arrays, calls ::powf on every lane, reloads the four results and adds them to a running sum
// that is finally handed to bench::consumeResult.
void BM_pow_libc_sse(benchmark::State& state) {
  const auto& baseInputs = bench::powBaseInputs<Flt>();
  const auto& expInputs = bench::powExpInputs<Flt>();
  Flt acc = _mm_setzero_ps();
  size_t cursor = 0;
  for (auto _ : state) {
    (void)_;
    // 16-byte alignment is required by the aligned store/load intrinsics used below.
    alignas(16) float base[4];
    alignas(16) float expo[4];
    alignas(16) float out[4];
    _mm_store_ps(base, baseInputs[cursor]);
    _mm_store_ps(expo, expInputs[cursor]);
    out[0] = ::powf(base[0], expo[0]);
    out[1] = ::powf(base[1], expo[1]);
    out[2] = ::powf(base[2], expo[2]);
    out[3] = ::powf(base[3], expo[3]);
    acc = _mm_add_ps(acc, _mm_load_ps(out));
    // Step to the next input vector, wrapping via the mask.
    cursor = (cursor + 1) & bench::kInputsMask;
  }
  // Four scalar evaluations happen per benchmark iteration.
  state.SetItemsProcessed(state.iterations() * 4);
  bench::consumeResult(acc);
}

// --- Registrations ---

// Trigonometric and inverse-trigonometric functions.
BENCHMARK(BM_sin_sse);
BENCHMARK(BM_sin_sse_accurate);
BENCHMARK(BM_cos_sse);
BENCHMARK(BM_cos_sse_accurate);
BENCHMARK(BM_tan_sse);
BENCHMARK(BM_atan_sse);
BENCHMARK(BM_acos_sse);
BENCHMARK(BM_asin_sse);
// Exponential family.
BENCHMARK(BM_exp_sse);
BENCHMARK(BM_exp_sse_accurate);
BENCHMARK(BM_exp_sse_bounds);
BENCHMARK(BM_exp2_sse);
BENCHMARK(BM_exp10_sse);
BENCHMARK(BM_expm1_sse);
// Logarithm family.
BENCHMARK(BM_log_sse);
BENCHMARK(BM_log_sse_accurate);
BENCHMARK(BM_log2_sse);
BENCHMARK(BM_log10_sse);
BENCHMARK(BM_log1p_sse);
// Roots, exponent manipulation, tanh and erf.
BENCHMARK(BM_cbrt_sse);
BENCHMARK(BM_cbrt_sse_accurate);
BENCHMARK(BM_frexp_sse);
BENCHMARK(BM_ldexp_sse);
BENCHMARK(BM_tanh_sse);
BENCHMARK(BM_erf_sse);
// Two-argument functions, with the libc baselines next to their dfm counterparts.
BENCHMARK(BM_atan2_sse);
BENCHMARK(BM_hypot_sse);
BENCHMARK(BM_hypot_sse_bounds);
BENCHMARK(BM_hypot_libc);
BENCHMARK(BM_pow_sse);
BENCHMARK(BM_pow_sse_accurate);
BENCHMARK(BM_pow_sse_scalar_exp);
BENCHMARK(BM_pow_libc_sse);

#else // !defined(__SSE4_1__)

// Fallback entry point used when __SSE4_1__ is not defined: prints a notice and exits with 0.
int main() {
  std::cout << "SSE4.1 not available, skipping benchmarks." << '\n' << std::flush;
  return 0;
}

#endif // defined(__SSE4_1__)


================================================
FILE: benchmarks/for_each_benchmark.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * Benchmarks for dispenso::for_each_n / dispenso::for_each.
 * Tests scheduling overhead across different container/iterator types:
 *   - vector (random-access)
 *   - deque (random-access)
 *   - list (bidirectional)
 *   - set (bidirectional, const elements)
 */

#include <dispenso/for_each.h>

#include <deque>
#include <list>
#include <set>
#include <unordered_map>

#include "thread_benchmark_common.h"

// Fixed seed handed to srand() in getInputs().
static uint32_t kSeed(8);
// Element counts used as benchmark arguments and as BM_serial template parameters.
static constexpr int kSmallSize = 1000;
static constexpr int kMediumSize = 1000000;
static constexpr int kLargeSize = 100000000;

// Returns the input vector for the requested element count, generating and caching it on first
// use. Values are (rand() & 255) - 127 after reseeding with kSeed, so each distinct size is
// generated from the same seed. The cache is a function-local static with no locking.
const std::vector<int>& getInputs(int num_elements) {
  static std::unordered_map<int, std::vector<int>> cache;
  const auto cached = cache.find(num_elements);
  if (cached == cache.end()) {
    srand(kSeed);
    std::vector<int> generated;
    generated.reserve(num_elements);
    for (int remaining = num_elements; remaining > 0; --remaining) {
      generated.push_back((rand() & 255) - 127);
    }
    const auto inserted = cache.emplace(num_elements, std::move(generated));
    // The lookup above missed, so the insertion must have taken place.
    assert(inserted.second);
    return inserted.first->second;
  }
  return cached->second;
}

// Verifies that output[i] == input[i]^2 - 3*input[i] for every index of input. On the first
// mismatch it prints the actual and expected values to stderr and aborts the process.
// output must hold at least input.size() elements.
void checkResults(const std::vector<int>& input, const std::vector<int>& output) {
  const size_t count = input.size();
  for (size_t pos = 0; pos < count; ++pos) {
    const int expected = input[pos] * input[pos] - 3 * input[pos];
    if (output[pos] == expected) {
      continue;
    }
    std::cerr << "FAIL! " << output[pos] << " vs " << expected << std::endl;
    abort();
  }
}

// Single-threaded baseline: computes output[i] = input[i]^2 - 3*input[i] over num_elements
// inputs once per benchmark iteration.
//
// After the timed loop the results are verified with checkResults(), mirroring BM_for_each_n.
// The check also makes the stores to `output` observable, so the loop cannot be treated as dead
// code. The element count is converted to size_t once so the loop index and bound share a type.
template <int num_elements>
void BM_serial(benchmark::State& state) {
  constexpr size_t kCount = static_cast<size_t>(num_elements);
  std::vector<int> output(kCount, 0);
  const auto& input = getInputs(num_elements);

  for (auto UNUSED_VAR : state) {
    for (size_t i = 0; i < kCount; ++i) {
      output[i] = input[i] * input[i] - 3 * input[i];
    }
  }

  // Outside the timed region: validates and consumes the computed values.
  checkResults(input, output);
}

// Benchmarks dispenso::for_each_n over a std::vector. range(0) - 1 is the pool's thread count
// and range(1) the element count. Each element's result is written to the output slot at the
// same index, and the output is verified with checkResults() after the timed loop.
void BM_for_each_n(benchmark::State& state) {
  const int worker_count = state.range(0) - 1;
  const int element_count = state.range(1);

  std::vector<int> results(element_count, 0);
  dispenso::ThreadPool pool(worker_count);

  const std::vector<int>& source = getInputs(element_count);
  for (auto UNUSED_VAR : state) {
    dispenso::TaskSet tasks(pool);
    dispenso::for_each_n(
        tasks,
        source.begin(),
        static_cast<size_t>(element_count),
        [&results, &source](const int& elem) {
          // The element's address relative to the vector's storage gives its index; this relies
          // on for_each_n passing a reference to the element itself rather than to a copy.
          const size_t slot = static_cast<size_t>(&elem - source.data());
          results[slot] = elem * elem - 3 * elem;
        });
  }
  checkResults(source, results);
}

// Benchmarks dispenso::for_each_n over a std::deque copy of the inputs. range(0) - 1 is the
// pool's thread count and range(1) the element count. Per-element results are folded into one
// atomic sum, which is reset at the start of every iteration.
void BM_for_each_n_deque(benchmark::State& state) {
  const int worker_count = state.range(0) - 1;
  const int element_count = state.range(1);

  const std::vector<int>& source = getInputs(element_count);
  std::deque<int> deq(source.begin(), source.end());
  dispenso::ThreadPool pool(worker_count);

  std::atomic<int64_t> total(0);
  auto accumulate = [&total](const int& elem) {
    total.fetch_add(elem * elem - 3 * elem, std::memory_order_relaxed);
  };
  for (auto UNUSED_VAR : state) {
    total.store(0, std::memory_order_relaxed);
    dispenso::TaskSet tasks(pool);
    dispenso::for_each_n(tasks, deq.begin(), static_cast<size_t>(element_count), accumulate);
  }
  benchmark::DoNotOptimize(total.load());
}

// Benchmarks dispenso::for_each_n over a std::list copy of the inputs. range(0) - 1 is the
// pool's thread count and range(1) the element count. Per-element results are folded into one
// atomic sum, which is reset at the start of every iteration.
void BM_for_each_n_list(benchmark::State& state) {
  const int worker_count = state.range(0) - 1;
  const int element_count = state.range(1);

  const std::vector<int>& source = getInputs(element_count);
  std::list<int> lst(source.begin(), source.end());
  dispenso::ThreadPool pool(worker_count);

  std::atomic<int64_t> total(0);
  auto accumulate = [&total](const int& elem) {
    total.fetch_add(elem * elem - 3 * elem, std::memory_order_relaxed);
  };
  for (auto UNUSED_VAR : state) {
    total.store(0, std::memory_order_relaxed);
    dispenso::TaskSet tasks(pool);
    dispenso::for_each_n(tasks, lst.begin(), static_cast<size_t>(element_count), accumulate);
  }
  benchmark::DoNotOptimize(total.load());
}

// Benchmarks dispenso::for_each_n over a std::set built from the inputs. range(0) - 1 is the
// pool's thread count and range(1) the requested element count. Per-element results are folded
// into one atomic sum, which is reset at the start of every iteration.
void BM_for_each_n_set(benchmark::State& state) {
  const int worker_count = state.range(0) - 1;
  const int element_count = state.range(1);

  const std::vector<int>& source = getInputs(element_count);
  std::set<int> unique_values(source.begin(), source.end());
  // The set drops duplicates, so iterate over what it actually holds rather than element_count.
  const size_t unique_count = unique_values.size();
  dispenso::ThreadPool pool(worker_count);

  std::atomic<int64_t> total(0);
  auto accumulate = [&total](const int& elem) {
    total.fetch_add(elem * elem - 3 * elem, std::memory_order_relaxed);
  };
  for (auto UNUSED_VAR : state) {
    total.store(0, std::memory_order_relaxed);
    dispenso::TaskSet tasks(pool);
    dispenso::for_each_n(tasks, unique_values.begin(), unique_count, accumulate);
  }
  benchmark::DoNotOptimize(total.load());
}

// Registers one {threads, elements} argument pair for every combination of the three element
// counts and the thread counts returned by pow2HalfStepThreads(). Element count is the outer
// loop.
static void CustomArguments(benchmark::internal::Benchmark* b) {
  for (int element_count : {kSmallSize, kMediumSize, kLargeSize}) {
    for (int thread_count : pow2HalfStepThreads()) {
      b->Args({thread_count, element_count});
    }
  }
}

// Reduced argument set that omits kLargeSize, for containers where 100M elements is
// impractical. Otherwise identical to CustomArguments: {threads, elements} pairs with the
// element count as the outer loop.
static void SmallArguments(benchmark::internal::Benchmark* b) {
  for (int element_count : {kSmallSize, kMediumSize}) {
    for (int thread_count : pow2HalfStepThreads()) {
      b->Args({thread_count, element_count});
    }
  }
}

// Serial baselines, one instantiation per element count.
BENCHMARK_TEMPLATE(BM_serial, kSmallSize);
BENCHMARK_TEMPLATE(BM_serial, kMediumSize);
BENCHMARK_TEMPLATE(BM_serial, kLargeSize);

// Parallel variants. The list and set benchmarks use SmallArguments, which omits kLargeSize.
BENCHMARK(BM_for_each_n)->Apply(CustomArguments)->UseRealTime();
BENCHMARK(BM_for_each_n_deque)->Apply(CustomArguments)->UseRealTime();
BENCHMARK(BM_for_each_n_list)->Apply(SmallArguments)->UseRealTime();
BENCHMARK(BM_for_each_n_set)->Apply(SmallArguments)->UseRealTime();

BENCHMARK_MAIN();


================================================
FILE: benchmarks/for_latency_benchmark.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <dispenso/parallel_for.h>
#include <dispenso/thread_pool.h>
#include <dispenso/timing.h>

#if defined(_OPENMP)
#include <omp.h>
#endif

#include <random>
#include <unordered_map>

#if !defined(BENCHMARK_WITHOUT_TBB)
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb_compat.h"
#endif // !BENCHMARK_WITHOUT_TBB

#include "thread_benchmark_common.h"

namespace {

using namespace std::chrono_literals;

// Seed for the std::mt19937_64 generator in getInputs().
uint32_t kSeed(8);
// Number of input elements processed per benchmark iteration.
constexpr int kSize = 50000;
// Sleep inserted before each timed region in the benchmark loops.
constexpr auto kSleep = 30ms;
} // namespace

// Adapted from Google gtest examples
// Returns true iff n is a prime number, using trial division by odd candidates.
bool isPrime(int n) {
  // Nothing at or below 1 is prime.
  if (n <= 1) {
    return false;
  }

  // 2 is the only even prime.
  if (n % 2 == 0) {
    return n == 2;
  }

  // n is odd and >= 3 here. Only candidates up to the square root of n need testing; the bound
  // is written as d <= n / d so that d * d is never computed and cannot overflow.
  for (int d = 3; d <= n / d; d += 2) {
    if (n % d == 0) {
      return false;
    }
  }

  // No divisor was found in the range (1, n), so n is prime.
  return true;
}

// Returns the input vector for the requested element count, generating and caching it on first
// use. Values are drawn uniformly from [100000, 1000000] using a std::mt19937_64 seeded with
// kSeed. The cache is a function-local static with no locking.
const std::vector<int>& getInputs(int numElements) {
  static std::unordered_map<int, std::vector<int>> cache;
  const auto hit = cache.find(numElements);
  if (hit == cache.end()) {
    std::mt19937_64 engine(kSeed);
    std::uniform_int_distribution<> dist(100000, 1000000);
    std::vector<int> generated;
    generated.reserve(numElements);
    for (int remaining = numElements; remaining > 0; --remaining) {
      generated.push_back(dist(engine));
    }
    const auto inserted = cache.emplace(numElements, std::move(generated));
    // The lookup above missed, so the insertion must have taken place.
    assert(inserted.second);
    return inserted.first->second;
  }
  return hit->second;
}

// Single-threaded baseline: each benchmark iteration sleeps for kSleep, then times one serial
// isPrime() pass over the kSize inputs. The per-iteration durations are handed to doStats().
void BM_serial(benchmark::State& state) {
  std::vector<int> output(kSize, 0);
  auto& input = getInputs(kSize);

  // Manually measured duration of each iteration's compute pass.
  std::vector<double> times;
  times.reserve(1000);

  for (auto UNUSED_VAR : state) {
    // The sleep happens before the start timestamp, so it is not part of the recorded duration.
    std::this_thread::sleep_for(kSleep);
    // The new slot temporarily holds the start timestamp...
    times.push_back(dispenso::getTime());
    for (size_t i = 0; i < kSize; ++i) {
      output[i] = isPrime(input[i]);
    }
    // ...and is then overwritten with the elapsed time.
    times.back() = dispenso::getTime() - times.back();
  }

  doStats(times, state);
}

// dispenso variant: each benchmark iteration sleeps for kSleep, then times one
// dispenso::parallel_for pass of isPrime() over the kSize inputs. The per-iteration durations
// are handed to doStats().
void BM_dispenso(benchmark::State& state) {
  // NOTE(review): presumably one less than the requested count because the calling thread also
  // participates in parallel_for -- confirm.
  const int numThreads = state.range(0) - 1;

  std::vector<int> output(kSize, 0);
  dispenso::resizeGlobalThreadPool(numThreads);

  // Manually measured duration of each iteration's compute pass.
  std::vector<double> times;
  times.reserve(1000);

  auto& input = getInputs(kSize);
  for (auto UNUSED_VAR : state) {
    // The sleep happens before the start timestamp, so it is not part of the recorded duration.
    std::this_thread::sleep_for(kSleep);
    // The new slot temporarily holds the start timestamp...
    times.push_back(dispenso::getTime());
    dispenso::parallel_for(
        dispenso::makeChunkedRange(0, kSize), [&input, &output](size_t i, size_t e) {
          // Each invocation handles the half-open index range [i, e).
          for (; i != e; ++i) {
            output[i] = isPrime(input[i]);
          }
        });
    // ...and is then overwritten with the elapsed time.
    times.back() = dispenso::getTime() - times.back();
  }

  doStats(times, state);
}

#if defined(_OPENMP)
// OpenMP variant: each benchmark iteration sleeps for kSleep, then times one
// `#pragma omp parallel for` pass of isPrime() over the kSize inputs. The per-iteration
// durations are handed to doStats().
void BM_omp(benchmark::State& state) {
  const int numThreads = state.range(0);

  std::vector<int> output(kSize, 0);
  omp_set_num_threads(numThreads);

  // Manually measured duration of each iteration's compute pass.
  std::vector<double> times;
  times.reserve(1000);

  auto& input = getInputs(kSize);
  for (auto UNUSED_VAR : state) {
    // The sleep happens before the start timestamp, so it is not part of the recorded duration.
    std::this_thread::sleep_for(kSleep);
    // The new slot temporarily holds the start timestamp...
    times.push_back(dispenso::getTime());
#pragma omp parallel for
    for (int i = 0; i < kSize; ++i) {
      output[i] = isPrime(input[i]);
    }
    // ...and is then overwritten with the elapsed time.
    times.back() = dispenso::getTime() - times.back();
  }
  doStats(times, state);
}
#endif /*defined(_OPENMP)*/

#if !defined(BENCHMARK_WITHOUT_TBB)
// TBB variant: each benchmark iteration sleeps for kSleep, then times one tbb::parallel_for
// pass of isPrime() over the kSize inputs. The per-iteration durations are handed to doStats().
void BM_tbb(benchmark::State& state) {
  const int numThreads = state.range(0);

  std::vector<int> output(kSize, 0);

  // Scheduler object constructed with the requested thread count; it lives for the whole run.
  tbb_compat::task_scheduler_init initsched(numThreads);

  // Manually measured duration of each iteration's compute pass.
  std::vector<double> times;
  times.reserve(1000);

  auto& input = getInputs(kSize);
  for (auto UNUSED_VAR : state) {
    // The sleep happens before the start timestamp, so it is not part of the recorded duration.
    std::this_thread::sleep_for(kSleep);
    // The new slot temporarily holds the start timestamp...
    times.push_back(dispenso::getTime());
    tbb::parallel_for(
        tbb::blocked_range<size_t>(0, kSize),
        [&input, &output](const tbb::blocked_range<size_t>& r) {
          for (size_t i = r.begin(); i < r.end(); ++i) {
            output[i] = isPrime(input[i]);
          }
        });
    // ...and is then overwritten with the elapsed time.
    times.back() = dispenso::getTime() - times.back();
  }
  doStats(times, state);
}
#endif // !BENCHMARK_WITHOUT_TBB

// Registers one benchmark argument per thread count returned by pow2HalfStepThreads().
static void CustomArguments(benchmark::internal::Benchmark* b) {
  for (int threadCount : pow2HalfStepThreads()) {
    b->Arg(threadCount);
  }
}

// Serial baseline takes no thread-count argument.
BENCHMARK(BM_serial)->UseRealTime();

// The OpenMP and TBB variants are only registered when the corresponding support is compiled in.
#if defined(_OPENMP)
BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime();
#endif // OPENMP
#if !defined(BENCHMARK_WITHOUT_TBB)
BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime();
#endif // !BENCHMARK_WITHOUT_TBB

BENCHMARK(BM_dispenso)->Apply(CustomArguments)->UseRealTime();

BENCHMARK_MAIN();


================================================
FILE: benchmarks/future_benchmark.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <array>
#include <cmath>
#include <future>
#include <iostream>
#include <random>

#include <dispenso/future.h>

#if !defined(BENCHMARK_WITHOUT_FOLLY)
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>
#endif // !BENCHMARK_WITHOUT_FOLLY

#include "thread_benchmark_common.h"

// Problem sizes for the benchmarks in this file.
// NOTE(review): presumably tree depths (Allocator::reset sizes its storage as 2^depth - 1 nodes)
// -- confirm against the benchmark bodies.
constexpr size_t kSmallSize = 13;
constexpr size_t kMediumSize = 16;
constexpr size_t kLargeSize = 19;

// Note that there are many optimizations that could be made for these tree build routines.  The
// goal was to make these as apples-to-apples as possible.

// Binary tree node carrying a 32-bit payload.
struct Node {
  Node* left;
  Node* right;
  uint32_t value;

  // Sets `value` to the sum of the 32 lowest base-`modulo` digits of `unique_bitset`.
  // `modulo` must be non-zero.
  void setValue(uint32_t unique_bitset, uint32_t modulo) {
    uint32_t digitSum = 0;
    uint32_t remaining = unique_bitset;
    // Always exactly 32 rounds; once `remaining` reaches zero the later rounds add nothing.
    for (uint32_t round = 0; round != 32; ++round) {
      digitSum += remaining % modulo;
      remaining /= modulo;
    }
    value = digitSum;
  }
};

// Bump allocator over a pre-sized vector of Nodes: reset() sizes the storage and rewinds the
// cursor, and alloc() hands out consecutive slots through an atomic counter.
class Allocator {
 public:
  // Resizes the backing storage to 2^depth - 1 nodes and rewinds the allocation cursor to 0.
  // Precondition: depth must be smaller than the bit width of size_t.
  void reset(size_t depth) {
    // Exact integer shift; std::pow would go through double and narrow implicitly to size_t.
    nodes_.resize((size_t{1} << depth) - 1);
    next_.store(0, std::memory_order_release);
  }

  // Returns a pointer to the next unused node. No bounds check is performed, so callers must not
  // request more nodes than the last reset() provided.
  Node* alloc() {
    size_t cur = next_.fetch_add(1, std::memory_order_relaxed);
    return &nodes_[cur];
  }

 private:
  std::vector<Node> nodes_;
  // Index of the next node to hand out.
  std::atomic<size_t> next_{0};
};

// Returns a lazily built, immutable table of 64 values drawn uniformly from [2, 55]. The table
// is generated once from a default-constructed std::mt19937 and kept in a function-local static.
const std::vector<uint32_t>& getModulos() {
  static const std::vector<uint32_t> kTable = []() {
    constexpr size_t kCount = 64;
    std::mt19937 engine;
    std::uniform_int_distribution<> pick(2, 55);
    std::vector<uint32_t> table;
    table.reserve(kCount);
    while (table.size() < kCount) {
      table.push_back(static_cast<uint32_t>(pick(engine)));
    }
    return table;
  }();
  return kTable;
}

uint64_t sumTree(Node*

Download .txt

gitextract_x94z546h/

├── .clang-format
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   └── feature_request.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── build.yml
│       ├── codeql.yml
│       └── docs.yml
├── .gitignore
├── CHANGELOG.md
├── CMakeLists.txt
├── CMakePresets.json
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── benchmarks/
│   ├── CMakeLists.txt
│   ├── benchmark_common.h
│   ├── cascading_parallel_for_benchmark.cpp
│   ├── concurrent_vector_benchmark.cpp
│   ├── fast_math/
│   │   ├── CMakeLists.txt
│   │   ├── avx512_benchmarks.cpp
│   │   ├── avx_benchmarks.cpp
│   │   ├── benchmark_helpers.h
│   │   ├── benchmarks.cpp
│   │   ├── erf_benchmarks.cpp
│   │   ├── hwy_benchmarks.cpp
│   │   ├── neon_benchmarks.cpp
│   │   └── sse_benchmarks.cpp
│   ├── for_each_benchmark.cpp
│   ├── for_latency_benchmark.cpp
│   ├── future_benchmark.cpp
│   ├── graph_benchmark.cpp
│   ├── graph_scene_benchmark.cpp
│   ├── idle_pool_benchmark.cpp
│   ├── locality_benchmark.cpp
│   ├── nested_for_benchmark.cpp
│   ├── nested_pool_benchmark.cpp
│   ├── once_function_benchmark.cpp
│   ├── pipeline_benchmark.cpp
│   ├── pool_allocator_benchmark.cpp
│   ├── run_benchmarks.py
│   ├── rw_lock_benchmark.cpp
│   ├── simple_for_benchmark.cpp
│   ├── simple_pool_benchmark.cpp
│   ├── small_buffer_benchmark.cpp
│   ├── summing_for_benchmark.cpp
│   ├── tbb_compat.h
│   ├── thread_benchmark_common.h
│   ├── timed_task_benchmark.cpp
│   └── trivial_compute_benchmark.cpp
├── cmake/
│   └── DispensoConfig.cmake.in
├── codecov.yml
├── dispenso/
│   ├── CMakeLists.txt
│   ├── async_request.h
│   ├── completion_event.h
│   ├── concurrent_object_arena.h
│   ├── concurrent_vector.h
│   ├── detail/
│   │   ├── can_invoke.h
│   │   ├── completion_event_impl.h
│   │   ├── concurrent_vector_impl.h
│   │   ├── concurrent_vector_impl2.h
│   │   ├── epoch_waiter.h
│   │   ├── future_impl.h
│   │   ├── future_impl2.h
│   │   ├── graph_executor_impl.h
│   │   ├── math.h
│   │   ├── notifier_common.h
│   │   ├── once_callable_impl.h
│   │   ├── op_result.h
│   │   ├── per_thread_info.cpp
│   │   ├── per_thread_info.h
│   │   ├── pipeline_impl.h
│   │   ├── quanta.cpp
│   │   ├── quanta.h
│   │   ├── result_of.h
│   │   ├── rw_lock_impl.h
│   │   ├── small_buffer_allocator_impl.h
│   │   ├── task_set_impl.h
│   │   └── timed_task_impl.h
│   ├── dispenso.h
│   ├── fast_math/
│   │   ├── README.md
│   │   ├── detail/
│   │   │   ├── double_promote.h
│   │   │   └── fast_math_impl.h
│   │   ├── fast_math.h
│   │   ├── float_traits.h
│   │   ├── float_traits_avx.h
│   │   ├── float_traits_avx512.h
│   │   ├── float_traits_hwy.h
│   │   ├── float_traits_neon.h
│   │   ├── float_traits_x86.h
│   │   ├── simd.h
│   │   └── util.h
│   ├── for_each.h
│   ├── future.h
│   ├── graph.cpp
│   ├── graph.h
│   ├── graph_executor.cpp
│   ├── graph_executor.h
│   ├── latch.h
│   ├── once_function.h
│   ├── parallel_for.h
│   ├── pipeline.h
│   ├── platform.h
│   ├── pool_allocator.cpp
│   ├── pool_allocator.h
│   ├── priority.cpp
│   ├── priority.h
│   ├── resource_pool.h
│   ├── rw_lock.h
│   ├── schedulable.h
│   ├── small_buffer_allocator.cpp
│   ├── small_buffer_allocator.h
│   ├── small_vector.h
│   ├── spsc_ring_buffer.h
│   ├── task_set.cpp
│   ├── task_set.h
│   ├── third-party/
│   │   └── moodycamel/
│   │       ├── LICENSE.md
│   │       ├── README.txt
│   │       ├── blockingconcurrentqueue.h
│   │       ├── concurrentqueue.h
│   │       └── lightweightsemaphore.h
│   ├── thread_id.cpp
│   ├── thread_id.h
│   ├── thread_pool.cpp
│   ├── thread_pool.h
│   ├── timed_task.cpp
│   ├── timed_task.h
│   ├── timing.cpp
│   ├── timing.h
│   ├── tsan_annotations.cpp
│   ├── tsan_annotations.h
│   ├── util.h
│   └── utils/
│       └── graph_dot.h
├── docs/
│   ├── Doxyfile
│   ├── benchmarks/
│   │   ├── benchmark_results.md
│   │   ├── concurrent_vector_details.md
│   │   ├── concurrent_vector_tcmalloc_details.md
│   │   ├── for_latency_details.md
│   │   ├── future_details.md
│   │   ├── graph_details.md
│   │   ├── graph_scene_details.md
│   │   ├── idle_pool_details.md
│   │   ├── index.html
│   │   ├── nested_for_details.md
│   │   ├── nested_pool_details.md
│   │   ├── once_function_details.md
│   │   ├── pipeline_details.md
│   │   ├── pool_allocator_details.md
│   │   ├── rw_lock_details.md
│   │   ├── simple_for_details.md
│   │   ├── simple_pool_details.md
│   │   ├── small_buffer_details.md
│   │   ├── summing_for_details.md
│   │   ├── timed_task_details.md
│   │   └── trivial_compute_details.md
│   ├── building.md
│   ├── custom.css
│   ├── design/
│   │   ├── barrier_dispatch.md
│   │   ├── coroutines.md
│   │   ├── cpp20_concepts.md
│   │   ├── fast_math_roadmap.md
│   │   ├── parallel_algorithms.md
│   │   ├── release_checklist.md
│   │   └── roadmap.md
│   ├── getting_started.md
│   ├── groups.dox
│   ├── header.html
│   ├── mainpage.md
│   ├── migrating_from_openmp.md
│   ├── migrating_from_tbb.md
│   └── third-party/
│       └── doxygen-awesome/
│           ├── doxygen-awesome-darkmode-toggle.js
│           └── doxygen-awesome.css
├── examples/
│   ├── CMakeLists.txt
│   ├── concurrent_vector_example.cpp
│   ├── for_each_example.cpp
│   ├── future_example.cpp
│   ├── graph_example.cpp
│   ├── latch_example.cpp
│   ├── parallel_for_example.cpp
│   ├── pipeline_example.cpp
│   ├── resource_pool_example.cpp
│   └── task_set_example.cpp
├── results/
│   ├── android_arm64.json
│   ├── linux_x64.json
│   ├── macos_arm64.json
│   └── windows_x64.json
├── run_bench.bat
├── scripts/
│   ├── BENCHMARKING.md
│   ├── compare_benchmarks.py
│   ├── generate_charts.py
│   ├── generate_plotly_benchmarks.py
│   ├── run_benchmarks.py
│   ├── update_benchmarks.py
│   └── update_package_managers.py
└── tests/
    ├── CMakeLists.txt
    ├── async_request_test.cpp
    ├── chunked_for_test.cpp
    ├── completion_event_test.cpp
    ├── concurrent_object_arena_test.cpp
    ├── concurrent_vector_a_test.cpp
    ├── concurrent_vector_b_test.cpp
    ├── concurrent_vector_default_test.cpp
    ├── concurrent_vector_nocache_test.cpp
    ├── concurrent_vector_test_common.h
    ├── concurrent_vector_test_common_types.h
    ├── fast_math/
    │   ├── CMakeLists.txt
    │   ├── acos_test.cpp
    │   ├── asin_test.cpp
    │   ├── atan2_test.cpp
    │   ├── atan_test.cpp
    │   ├── avx512_test.cpp
    │   ├── avx_test.cpp
    │   ├── bivariate_ulp_eval.h
    │   ├── cbrt_test.cpp
    │   ├── cos_test.cpp
    │   ├── erf_test.cpp
    │   ├── eval.cpp
    │   ├── eval.h
    │   ├── exp10_test.cpp
    │   ├── exp2_test.cpp
    │   ├── exp_test.cpp
    │   ├── expm1_test.cpp
    │   ├── frexp_test.cpp
    │   ├── hwy_test.cpp
    │   ├── hypot_test.cpp
    │   ├── ldexp_test.cpp
    │   ├── log10_test.cpp
    │   ├── log1p_test.cpp
    │   ├── log2_test.cpp
    │   ├── log_test.cpp
    │   ├── neon_test.cpp
    │   ├── pow_test.cpp
    │   ├── pow_ulp_eval.cpp
    │   ├── simd_test_utils.h
    │   ├── sin_test.cpp
    │   ├── sincos_test.cpp
    │   ├── sinpi_test.cpp
    │   ├── sse_test.cpp
    │   ├── tan_test.cpp
    │   ├── tanh_test.cpp
    │   ├── test_main.cpp
    │   ├── ulp_eval.cpp
    │   └── util_test.cpp
    ├── for_each_test.cpp
    ├── forward_shared_pool.cpp
    ├── future_test.cpp
    ├── graph_test.cpp
    ├── greedy_for_ranges_test.cpp
    ├── greedy_for_test.cpp
    ├── latch_test.cpp
    ├── once_function_test.cpp
    ├── pipeline_test.cpp
    ├── pool_allocator_test.cpp
    ├── priority_test.cpp
    ├── resource_pool_test.cpp
    ├── rw_lock_test.cpp
    ├── shared_pool_test.cpp
    ├── small_buffer_allocator_test.cpp
    ├── small_vector_test.cpp
    ├── spsc_ring_buffer_test.cpp
    ├── task_set_test.cpp
    ├── test_tid.h
    ├── thread_id_test.cpp
    ├── thread_pool_test.cpp
    ├── timed_task_test.cpp
    ├── timing_test.cpp
    └── util_test.cpp

Download .txt

SYMBOL INDEX (2316 symbols across 188 files)

FILE: benchmarks/cascading_parallel_for_benchmark.cpp
  function compute (line 48) | inline int32_t compute(int32_t x) {
  function fuse (line 52) | inline int32_t fuse(const std::array<int32_t, kNumLoops>& values) {
  type BenchArrays (line 60) | struct BenchArrays {
  function BenchArrays (line 66) | BenchArrays& getArrays(int32_t numElements) {
  function checkResults (line 87) | void checkResults(BenchArrays& ba, int32_t numElements) {
  function BM_serial (line 103) | void BM_serial(benchmark::State& state) {
  function BM_dispenso_blocking (line 126) | void BM_dispenso_blocking(benchmark::State& state) {
  function BM_dispenso_cascaded (line 165) | void BM_dispenso_cascaded(benchmark::State& state) {
  function BM_omp (line 223) | void BM_omp(benchmark::State& state) {
  function BM_tbb (line 253) | void BM_tbb(benchmark::State& state) {
  function BM_tbb_task_group (line 289) | void BM_tbb_task_group(benchmark::State& state) {
  function CustomArguments (line 328) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/concurrent_vector_benchmark.cpp
  function checkIotaSum (line 26) | void checkIotaSum(int64_t sum) {
  function checkIotaSum (line 35) | void checkIotaSum(const Cont& c, int64_t sum) {
  function pushBackImpl (line 55) | void pushBackImpl(benchmark::State& state, ContainerInit containerInit) {
  function pushBackGrowByAlternativeTbb (line 66) | void pushBackGrowByAlternativeTbb(benchmark::State& state, ContainerInit...
  function pushBackGrowByAlternativeDispenso (line 80) | void pushBackGrowByAlternativeDispenso(benchmark::State& state, Containe...
  function BM_std_push_back_serial (line 87) | void BM_std_push_back_serial(benchmark::State& state) {
  function BM_deque_push_back_serial (line 91) | void BM_deque_push_back_serial(benchmark::State& state) {
  function BM_tbb_push_back_serial (line 96) | void BM_tbb_push_back_serial(benchmark::State& state) {
  function BM_dispenso_push_back_serial (line 101) | void BM_dispenso_push_back_serial(benchmark::State& state) {
  function BM_tbb_push_back_serial_grow_by_alternative (line 106) | void BM_tbb_push_back_serial_grow_by_alternative(benchmark::State& state) {
  function BM_dispenso_push_back_serial_grow_by_alternative (line 111) | void BM_dispenso_push_back_serial_grow_by_alternative(benchmark::State& ...
  function BM_std_push_back_serial_reserve (line 115) | void BM_std_push_back_serial_reserve(benchmark::State& state) {
  function BM_tbb_push_back_serial_reserve (line 124) | void BM_tbb_push_back_serial_reserve(benchmark::State& state) {
  function BM_dispenso_push_back_serial_reserve (line 133) | void BM_dispenso_push_back_serial_reserve(benchmark::State& state) {
  function BM_tbb_push_back_serial_grow_by_alternative_reserve (line 139) | void BM_tbb_push_back_serial_grow_by_alternative_reserve(benchmark::Stat...
  function BM_dispenso_push_back_serial_grow_by_alternative_reserve (line 148) | void BM_dispenso_push_back_serial_grow_by_alternative_reserve(benchmark:...
  function iterateImpl (line 154) | void iterateImpl(benchmark::State& state, ContainerInit containerInit) {
  function BM_std_iterate (line 170) | void BM_std_iterate(benchmark::State& state) {
  function BM_deque_iterate (line 174) | void BM_deque_iterate(benchmark::State& state) {
  function BM_tbb_iterate (line 179) | void BM_tbb_iterate(benchmark::State& state) {
  function BM_dispenso_iterate (line 184) | void BM_dispenso_iterate(benchmark::State& state) {
  type ReverseWrapper (line 189) | struct ReverseWrapper {
  function begin (line 194) | auto begin(ReverseWrapper<T> w) {
  function end (line 199) | auto end(ReverseWrapper<T> w) {
  function reverse (line 204) | ReverseWrapper<T> reverse(T&& iterable) {
  function iterateReverseImpl (line 209) | void iterateReverseImpl(benchmark::State& state, ContainerInit container...
  function BM_std_iterate_reverse (line 225) | void BM_std_iterate_reverse(benchmark::State& state) {
  function BM_deque_iterate_reverse (line 229) | void BM_deque_iterate_reverse(benchmark::State& state) {
  function BM_tbb_iterate_reverse (line 234) | void BM_tbb_iterate_reverse(benchmark::State& state) {
  function BM_dispenso_iterate_reverse (line 239) | void BM_dispenso_iterate_reverse(benchmark::State& state) {
  function lowerBoundImpl (line 244) | void lowerBoundImpl(benchmark::State& state, ContainerInit containerInit) {
  function BM_std_lower_bound (line 260) | void BM_std_lower_bound(benchmark::State& state) {
  function BM_deque_lower_bound (line 264) | void BM_deque_lower_bound(benchmark::State& state) {
  function BM_tbb_lower_bound (line 269) | void BM_tbb_lower_bound(benchmark::State& state) {
  function BM_dispenso_lower_bound (line 274) | void BM_dispenso_lower_bound(benchmark::State& state) {
  function indexImpl (line 279) | void indexImpl(benchmark::State& state, ContainerInit containerInit) {
  function BM_std_index (line 296) | void BM_std_index(benchmark::State& state) {
  function BM_deque_index (line 300) | void BM_deque_index(benchmark::State& state) {
  function BM_tbb_index (line 305) | void BM_tbb_index(benchmark::State& state) {
  function BM_dispenso_index (line 310) | void BM_dispenso_index(benchmark::State& state) {
  function randomImpl (line 315) | void randomImpl(benchmark::State& state, ContainerInit containerInit) {
  function BM_std_random (line 338) | void BM_std_random(benchmark::State& state) {
  function BM_deque_random (line 342) | void BM_deque_random(benchmark::State& state) {
  function BM_tbb_random (line 347) | void BM_tbb_random(benchmark::State& state) {
  function BM_dispenso_random (line 352) | void BM_dispenso_random(benchmark::State& state) {
  function parallelImpl (line 357) | void parallelImpl(
  function BM_std_parallel (line 368) | void BM_std_parallel(benchmark::State& state) {
  function BM_deque_parallel (line 379) | void BM_deque_parallel(benchmark::State& state) {
  function BM_tbb_parallel (line 391) | void BM_tbb_parallel(benchmark::State& state) {
  function BM_dispenso_parallel (line 399) | void BM_dispenso_parallel(benchmark::State& state) {
  function BM_std_parallel_reserve (line 406) | void BM_std_parallel_reserve(benchmark::State& state) {
  function BM_tbb_parallel_reserve (line 422) | void BM_tbb_parallel_reserve(benchmark::State& state) {
  function BM_dispenso_parallel_reserve (line 434) | void BM_dispenso_parallel_reserve(benchmark::State& state) {
  function parallelImplClear (line 442) | void parallelImplClear(
  function BM_std_parallel_clear (line 463) | void BM_std_parallel_clear(benchmark::State& state) {
  function BM_deque_parallel_clear (line 474) | void BM_deque_parallel_clear(benchmark::State& state) {
  function BM_tbb_parallel_clear (line 486) | void BM_tbb_parallel_clear(benchmark::State& state) {
  function BM_dispenso_parallel_clear (line 494) | void BM_dispenso_parallel_clear(benchmark::State& state) {
  function parallelImplGrowBy (line 502) | void parallelImplGrowBy(
  function BM_std_parallel_grow_by_10 (line 531) | void BM_std_parallel_grow_by_10(benchmark::State& state) {
  function BM_deque_parallel_grow_by_10 (line 545) | void BM_deque_parallel_grow_by_10(benchmark::State& state) {
  function BM_tbb_parallel_grow_by_10 (line 560) | void BM_tbb_parallel_grow_by_10(benchmark::State& state) {
  function BM_dispenso_parallel_grow_by_10 (line 574) | void BM_dispenso_parallel_grow_by_10(benchmark::State& state) {
  function BM_std_parallel_grow_by_100 (line 584) | void BM_std_parallel_grow_by_100(benchmark::State& state) {
  function BM_deque_parallel_grow_by_100 (line 598) | void BM_deque_parallel_grow_by_100(benchmark::State& state) {
  function BM_tbb_parallel_grow_by_100 (line 613) | void BM_tbb_parallel_grow_by_100(benchmark::State& state) {
  function BM_dispenso_parallel_grow_by_100 (line 627) | void BM_dispenso_parallel_grow_by_100(benchmark::State& state) {
  function parallelImplGrowByMax (line 638) | void parallelImplGrowByMax(
  function BM_std_parallel_grow_by_max (line 660) | void BM_std_parallel_grow_by_max(benchmark::State& state) {
  function BM_deque_parallel_grow_by_max (line 673) | void BM_deque_parallel_grow_by_max(benchmark::State& state) {
  function BM_tbb_parallel_grow_by_max (line 687) | void BM_tbb_parallel_grow_by_max(benchmark::State& state) {
  function BM_dispenso_parallel_grow_by_max (line 700) | void BM_dispenso_parallel_grow_by_max(benchmark::State& state) {

FILE: benchmarks/fast_math/avx512_benchmarks.cpp
  function BM_sin_avx512 (line 18) | void BM_sin_avx512(benchmark::State& state) {
  function BM_cos_avx512 (line 21) | void BM_cos_avx512(benchmark::State& state) {
  function BM_tan_avx512 (line 24) | void BM_tan_avx512(benchmark::State& state) {
  function BM_atan_avx512 (line 27) | void BM_atan_avx512(benchmark::State& state) {
  function BM_acos_avx512 (line 30) | void BM_acos_avx512(benchmark::State& state) {
  function BM_asin_avx512 (line 33) | void BM_asin_avx512(benchmark::State& state) {
  function BM_exp_avx512 (line 37) | void BM_exp_avx512(benchmark::State& state) {
  function BM_exp2_avx512 (line 40) | void BM_exp2_avx512(benchmark::State& state) {
  function BM_exp10_avx512 (line 43) | void BM_exp10_avx512(benchmark::State& state) {
  function BM_expm1_avx512 (line 46) | void BM_expm1_avx512(benchmark::State& state) {
  function BM_log_avx512 (line 50) | void BM_log_avx512(benchmark::State& state) {
  function BM_log2_avx512 (line 53) | void BM_log2_avx512(benchmark::State& state) {
  function BM_log10_avx512 (line 56) | void BM_log10_avx512(benchmark::State& state) {
  function BM_log1p_avx512 (line 59) | void BM_log1p_avx512(benchmark::State& state) {
  function BM_cbrt_avx512 (line 64) | void BM_cbrt_avx512(benchmark::State& state) {
  function BM_frexp_avx512 (line 68) | void BM_frexp_avx512(benchmark::State& state) {
  function BM_ldexp_avx512 (line 74) | void BM_ldexp_avx512(benchmark::State& state) {
  function BM_tanh_avx512 (line 79) | void BM_tanh_avx512(benchmark::State& state) {
  function BM_erf_avx512 (line 82) | void BM_erf_avx512(benchmark::State& state) {
  function BM_atan2_avx512 (line 88) | void BM_atan2_avx512(benchmark::State& state) {
  function BM_hypot_avx512 (line 94) | void BM_hypot_avx512(benchmark::State& state) {
  function BM_hypot_avx512_bounds (line 99) | void BM_hypot_avx512_bounds(benchmark::State& state) {
  function BM_pow_avx512 (line 105) | void BM_pow_avx512(benchmark::State& state) {
  function BM_pow_avx512_accurate (line 111) | void BM_pow_avx512_accurate(benchmark::State& state) {
  function BM_pow_avx512_scalar_exp (line 117) | void BM_pow_avx512_scalar_exp(benchmark::State& state) {
  function BM_hypot_libc_avx512 (line 123) | void BM_hypot_libc_avx512(benchmark::State& state) {
  function BM_pow_libc_avx512 (line 143) | void BM_pow_libc_avx512(benchmark::State& state) {
  function main (line 194) | int main() {

FILE: benchmarks/fast_math/avx_benchmarks.cpp
  type BoundsTraits (line 16) | struct BoundsTraits {
  function BM_sin_avx (line 23) | void BM_sin_avx(benchmark::State& state) {
  function BM_sin_avx_accurate (line 26) | void BM_sin_avx_accurate(benchmark::State& state) {
  function BM_cos_avx (line 31) | void BM_cos_avx(benchmark::State& state) {
  function BM_cos_avx_accurate (line 34) | void BM_cos_avx_accurate(benchmark::State& state) {
  function BM_tan_avx (line 39) | void BM_tan_avx(benchmark::State& state) {
  function BM_atan_avx (line 42) | void BM_atan_avx(benchmark::State& state) {
  function BM_acos_avx (line 45) | void BM_acos_avx(benchmark::State& state) {
  function BM_asin_avx (line 48) | void BM_asin_avx(benchmark::State& state) {
  function BM_exp_avx (line 52) | void BM_exp_avx(benchmark::State& state) {
  function BM_exp_avx_accurate (line 55) | void BM_exp_avx_accurate(benchmark::State& state) {
  function BM_exp_avx_bounds (line 60) | void BM_exp_avx_bounds(benchmark::State& state) {
  function BM_exp2_avx (line 64) | void BM_exp2_avx(benchmark::State& state) {
  function BM_exp10_avx (line 67) | void BM_exp10_avx(benchmark::State& state) {
  function BM_expm1_avx (line 70) | void BM_expm1_avx(benchmark::State& state) {
  function BM_log_avx (line 74) | void BM_log_avx(benchmark::State& state) {
  function BM_log_avx_accurate (line 77) | void BM_log_avx_accurate(benchmark::State& state) {
  function BM_log2_avx (line 82) | void BM_log2_avx(benchmark::State& state) {
  function BM_log10_avx (line 85) | void BM_log10_avx(benchmark::State& state) {
  function BM_log1p_avx (line 88) | void BM_log1p_avx(benchmark::State& state) {
  function BM_cbrt_avx (line 95) | void BM_cbrt_avx(benchmark::State& state) {
  function BM_cbrt_avx_accurate (line 98) | void BM_cbrt_avx_accurate(benchmark::State& state) {
  function BM_frexp_avx (line 104) | void BM_frexp_avx(benchmark::State& state) {
  function BM_ldexp_avx (line 110) | void BM_ldexp_avx(benchmark::State& state) {
  function BM_tanh_avx (line 115) | void BM_tanh_avx(benchmark::State& state) {
  function BM_erf_avx (line 118) | void BM_erf_avx(benchmark::State& state) {
  function BM_atan2_avx (line 124) | void BM_atan2_avx(benchmark::State& state) {
  function BM_hypot_avx (line 130) | void BM_hypot_avx(benchmark::State& state) {
  function BM_hypot_avx_bounds (line 135) | void BM_hypot_avx_bounds(benchmark::State& state) {
  function BM_pow_avx (line 141) | void BM_pow_avx(benchmark::State& state) {
  function BM_pow_avx_accurate (line 147) | void BM_pow_avx_accurate(benchmark::State& state) {
  function BM_pow_avx_scalar_exp (line 153) | void BM_pow_avx_scalar_exp(benchmark::State& state) {
  function BM_pow_libc_avx (line 159) | void BM_pow_libc_avx(benchmark::State& state) {
  function main (line 216) | int main() {

FILE: benchmarks/fast_math/benchmark_helpers.h
  function namespace (line 22) | namespace dispenso {

FILE: benchmarks/fast_math/benchmarks.cpp
  type BoundsTraits (line 16) | struct BoundsTraits {
  function BM_acos (line 142) | void BM_acos(benchmark::State& state) {
  function BM_asin (line 145) | void BM_asin(benchmark::State& state) {
  function BM_atan (line 148) | void BM_atan(benchmark::State& state) {
  function BM_cbrt (line 151) | void BM_cbrt(benchmark::State& state) {
  function BM_sin (line 154) | void BM_sin(benchmark::State& state) {
  function BM_cos (line 157) | void BM_cos(benchmark::State& state) {
  function BM_tan (line 160) | void BM_tan(benchmark::State& state) {
  function BM_exp (line 163) | void BM_exp(benchmark::State& state) {
  function BM_exp2 (line 166) | void BM_exp2(benchmark::State& state) {
  function BM_exp10 (line 169) | void BM_exp10(benchmark::State& state) {
  function BM_log (line 172) | void BM_log(benchmark::State& state) {
  function BM_log2 (line 175) | void BM_log2(benchmark::State& state) {
  function BM_log10 (line 178) | void BM_log10(benchmark::State& state) {
  function BM_expm1 (line 181) | void BM_expm1(benchmark::State& state) {
  function BM_log1p (line 184) | void BM_log1p(benchmark::State& state) {
  function BM_tanh (line 187) | void BM_tanh(benchmark::State& state) {
  function BM_sin_plus_cos (line 190) | void BM_sin_plus_cos(benchmark::State& state) {
  function BM_atan2 (line 194) | void BM_atan2(benchmark::State& state) {
  function BM_hypot (line 197) | void BM_hypot(benchmark::State& state) {
  function BM_pow (line 201) | void BM_pow(benchmark::State& state) {
  function BM_frexp (line 207) | void BM_frexp(benchmark::State& state) {
  function BM_ldexp (line 223) | void BM_ldexp(benchmark::State& state) {
  function BM_fastm_acos (line 238) | void BM_fastm_acos(benchmark::State& state) {
  function BM_fastm_asin (line 241) | void BM_fastm_asin(benchmark::State& state) {
  function BM_fastm_atan (line 244) | void BM_fastm_atan(benchmark::State& state) {
  function BM_fastm_cbrt (line 247) | void BM_fastm_cbrt(benchmark::State& state) {
  function BM_fastm_cbrt_accurate (line 250) | void BM_fastm_cbrt_accurate(benchmark::State& state) {
  function BM_fastm_sin (line 254) | void BM_fastm_sin(benchmark::State& state) {
  function BM_fastm_sin_accurate (line 257) | void BM_fastm_sin_accurate(benchmark::State& state) {
  function BM_fastm_cos (line 261) | void BM_fastm_cos(benchmark::State& state) {
  function BM_fastm_cos_accurate (line 264) | void BM_fastm_cos_accurate(benchmark::State& state) {
  function BM_fastm_tan (line 268) | void BM_fastm_tan(benchmark::State& state) {
  function BM_fastm_tan_accurate (line 271) | void BM_fastm_tan_accurate(benchmark::State& state) {
  function BM_fastm_exp (line 275) | void BM_fastm_exp(benchmark::State& state) {
  function BM_fastm_exp_bounds (line 278) | void BM_fastm_exp_bounds(benchmark::State& state) {
  function BM_fastm_exp_accurate (line 281) | void BM_fastm_exp_accurate(benchmark::State& state) {
  function BM_fastm_exp2 (line 285) | void BM_fastm_exp2(benchmark::State& state) {
  function BM_fastm_exp2_accurate (line 288) | void BM_fastm_exp2_accurate(benchmark::State& state) {
  function BM_fastm_exp10 (line 292) | void BM_fastm_exp10(benchmark::State& state) {
  function BM_fastm_exp10_accurate (line 295) | void BM_fastm_exp10_accurate(benchmark::State& state) {
  function BM_fastm_log (line 299) | void BM_fastm_log(benchmark::State& state) {
  function BM_fastm_log_accurate (line 302) | void BM_fastm_log_accurate(benchmark::State& state) {
  function BM_fastm_log2 (line 306) | void BM_fastm_log2(benchmark::State& state) {
  function BM_fastm_log2_accurate (line 309) | void BM_fastm_log2_accurate(benchmark::State& state) {
  function BM_fastm_log10 (line 313) | void BM_fastm_log10(benchmark::State& state) {
  function BM_fastm_log10_accurate (line 316) | void BM_fastm_log10_accurate(benchmark::State& state) {
  function BM_fastm_expm1 (line 320) | void BM_fastm_expm1(benchmark::State& state) {
  function BM_fastm_log1p (line 323) | void BM_fastm_log1p(benchmark::State& state) {
  function BM_fastm_tanh (line 326) | void BM_fastm_tanh(benchmark::State& state) {
  function BM_fastm_atan2 (line 330) | void BM_fastm_atan2(benchmark::State& state) {
  function BM_fastm_atan2_bounds (line 334) | void BM_fastm_atan2_bounds(benchmark::State& state) {
  function BM_fastm_hypot (line 339) | void BM_fastm_hypot(benchmark::State& state) {
  function BM_fastm_hypot_bounds (line 343) | void BM_fastm_hypot_bounds(benchmark::State& state) {
  function BM_naive_hypot (line 348) | void BM_naive_hypot(benchmark::State& state) {
  function BM_fastm_pow (line 352) | void BM_fastm_pow(benchmark::State& state) {
  function BM_fastm_pow_accurate (line 356) | void BM_fastm_pow_accurate(benchmark::State& state) {
  function BM_fastm_frexp (line 363) | void BM_fastm_frexp(benchmark::State& state) {
  function BM_fastm_ldexp (line 379) | void BM_fastm_ldexp(benchmark::State& state) {
  function BM_fastm_sin_plus_cos (line 394) | void BM_fastm_sin_plus_cos(benchmark::State& state) {
  function BM_fastm_sincos (line 397) | void BM_fastm_sincos(benchmark::State& state) {
  function BM_fastm_sinpi (line 404) | void BM_fastm_sinpi(benchmark::State& state) {
  function BM_fastm_cospi (line 407) | void BM_fastm_cospi(benchmark::State& state) {
  function BM_fastm_sincospi (line 410) | void BM_fastm_sincospi(benchmark::State& state) {
  function BM_batch_sinf (line 423) | static void BM_batch_sinf(benchmark::State& state) {
  function BM_batch_sin_scalar (line 436) | static void BM_batch_sin_scalar(benchmark::State& state) {
  function BM_batch_sin_sse (line 449) | static void BM_batch_sin_sse(benchmark::State& state) {
  function BM_batch_cos_scalar (line 465) | static void BM_batch_cos_scalar(benchmark::State& state) {
  function BM_batch_cos_sse (line 478) | static void BM_batch_cos_sse(benchmark::State& state) {

FILE: benchmarks/fast_math/erf_benchmarks.cpp
  function erf_s16 (line 89) | static inline float erf_s16(float x) {
  function erf_s21 (line 138) | static inline float erf_s21(float x) {
  function __m128 (line 193) | static inline __m128 erf_s16_sse(__m128 x) {
  function __m128 (line 275) | static inline __m128 erf_s21_sse(__m128 x) {
  function consumeSum (line 346) | static void consumeSum(__m128 sum) {
  function BM_erf_libc (line 354) | void BM_erf_libc(benchmark::State& state) {
  function BM_erf_s16_scalar (line 366) | void BM_erf_s16_scalar(benchmark::State& state) {
  function BM_erf_s21_scalar (line 378) | void BM_erf_s21_scalar(benchmark::State& state) {
  function BM_erf_s16_sse (line 392) | void BM_erf_s16_sse(benchmark::State& state) {
  function BM_erf_s21_sse (line 404) | void BM_erf_s21_sse(benchmark::State& state) {
  function consumeSum256 (line 424) | static void consumeSum256(__m256 sum) {
  function __m256 (line 435) | static inline __m256 erf_s16_avx(__m256 x) {
  function __m256 (line 524) | static inline __m256 erf_s21_avx(__m256 x) {
  function BM_erf_s16_avx (line 596) | void BM_erf_s16_avx(benchmark::State& state) {
  function BM_erf_s21_avx (line 608) | void BM_erf_s21_avx(benchmark::State& state) {
  function main (line 627) | int main() {

FILE: benchmarks/fast_math/hwy_benchmarks.cpp
  type BoundsTraits (line 19) | struct BoundsTraits {
  function BM_sin_hwy (line 26) | void BM_sin_hwy(benchmark::State& state) {
  function BM_sin_hwy_accurate (line 29) | void BM_sin_hwy_accurate(benchmark::State& state) {
  function BM_cos_hwy (line 34) | void BM_cos_hwy(benchmark::State& state) {
  function BM_cos_hwy_accurate (line 37) | void BM_cos_hwy_accurate(benchmark::State& state) {
  function BM_tan_hwy (line 42) | void BM_tan_hwy(benchmark::State& state) {
  function BM_atan_hwy (line 45) | void BM_atan_hwy(benchmark::State& state) {
  function BM_acos_hwy (line 48) | void BM_acos_hwy(benchmark::State& state) {
  function BM_asin_hwy (line 51) | void BM_asin_hwy(benchmark::State& state) {
  function BM_exp_hwy (line 55) | void BM_exp_hwy(benchmark::State& state) {
  function BM_exp_hwy_accurate (line 58) | void BM_exp_hwy_accurate(benchmark::State& state) {
  function BM_exp_hwy_bounds (line 63) | void BM_exp_hwy_bounds(benchmark::State& state) {
  function BM_exp2_hwy (line 67) | void BM_exp2_hwy(benchmark::State& state) {
  function BM_exp10_hwy (line 70) | void BM_exp10_hwy(benchmark::State& state) {
  function BM_expm1_hwy (line 73) | void BM_expm1_hwy(benchmark::State& state) {
  function BM_log_hwy (line 77) | void BM_log_hwy(benchmark::State& state) {
  function BM_log_hwy_accurate (line 80) | void BM_log_hwy_accurate(benchmark::State& state) {
  function BM_log2_hwy (line 85) | void BM_log2_hwy(benchmark::State& state) {
  function BM_log10_hwy (line 88) | void BM_log10_hwy(benchmark::State& state) {
  function BM_log1p_hwy (line 91) | void BM_log1p_hwy(benchmark::State& state) {
  function BM_cbrt_hwy (line 95) | void BM_cbrt_hwy(benchmark::State& state) {
  function BM_cbrt_hwy_accurate (line 98) | void BM_cbrt_hwy_accurate(benchmark::State& state) {
  function BM_frexp_hwy (line 104) | void BM_frexp_hwy(benchmark::State& state) {
  function BM_ldexp_hwy (line 110) | void BM_ldexp_hwy(benchmark::State& state) {
  function BM_tanh_hwy (line 117) | void BM_tanh_hwy(benchmark::State& state) {
  function BM_erf_hwy (line 120) | void BM_erf_hwy(benchmark::State& state) {
  function BM_atan2_hwy (line 126) | void BM_atan2_hwy(benchmark::State& state) {
  function BM_hypot_hwy (line 132) | void BM_hypot_hwy(benchmark::State& state) {
  function BM_hypot_hwy_bounds (line 137) | void BM_hypot_hwy_bounds(benchmark::State& state) {
  function BM_sin_hwy_contrib (line 145) | void BM_sin_hwy_contrib(benchmark::State& state) {
  function BM_cos_hwy_contrib (line 149) | void BM_cos_hwy_contrib(benchmark::State& state) {
  function BM_exp_hwy_contrib (line 153) | void BM_exp_hwy_contrib(benchmark::State& state) {
  function BM_exp2_hwy_contrib (line 157) | void BM_exp2_hwy_contrib(benchmark::State& state) {
  function BM_log_hwy_contrib (line 161) | void BM_log_hwy_contrib(benchmark::State& state) {
  function BM_log2_hwy_contrib (line 165) | void BM_log2_hwy_contrib(benchmark::State& state) {
  function BM_log10_hwy_contrib (line 169) | void BM_log10_hwy_contrib(benchmark::State& state) {
  function BM_atan_hwy_contrib (line 173) | void BM_atan_hwy_contrib(benchmark::State& state) {
  function BM_acos_hwy_contrib (line 177) | void BM_acos_hwy_contrib(benchmark::State& state) {
  function BM_asin_hwy_contrib (line 181) | void BM_asin_hwy_contrib(benchmark::State& state) {
  function BM_atan2_hwy_contrib (line 185) | void BM_atan2_hwy_contrib(benchmark::State& state) {
  function main (line 235) | int main() {

FILE: benchmarks/fast_math/neon_benchmarks.cpp
  function BM_sin_neon (line 18) | void BM_sin_neon(benchmark::State& state) {
  function BM_cos_neon (line 21) | void BM_cos_neon(benchmark::State& state) {
  function BM_tan_neon (line 24) | void BM_tan_neon(benchmark::State& state) {
  function BM_atan_neon (line 27) | void BM_atan_neon(benchmark::State& state) {
  function BM_acos_neon (line 30) | void BM_acos_neon(benchmark::State& state) {
  function BM_asin_neon (line 33) | void BM_asin_neon(benchmark::State& state) {
  function BM_exp_neon (line 37) | void BM_exp_neon(benchmark::State& state) {
  function BM_exp2_neon (line 40) | void BM_exp2_neon(benchmark::State& state) {
  function BM_exp10_neon (line 43) | void BM_exp10_neon(benchmark::State& state) {
  function BM_expm1_neon (line 46) | void BM_expm1_neon(benchmark::State& state) {
  function BM_log_neon (line 50) | void BM_log_neon(benchmark::State& state) {
  function BM_log2_neon (line 53) | void BM_log2_neon(benchmark::State& state) {
  function BM_log10_neon (line 56) | void BM_log10_neon(benchmark::State& state) {
  function BM_log1p_neon (line 59) | void BM_log1p_neon(benchmark::State& state) {
  function BM_cbrt_neon (line 63) | void BM_cbrt_neon(benchmark::State& state) {
  function BM_frexp_neon (line 67) | void BM_frexp_neon(benchmark::State& state) {
  function BM_ldexp_neon (line 73) | void BM_ldexp_neon(benchmark::State& state) {
  function BM_tanh_neon (line 78) | void BM_tanh_neon(benchmark::State& state) {
  function BM_erf_neon (line 81) | void BM_erf_neon(benchmark::State& state) {
  function BM_atan2_neon (line 87) | void BM_atan2_neon(benchmark::State& state) {
  function BM_hypot_neon (line 93) | void BM_hypot_neon(benchmark::State& state) {
  function BM_hypot_neon_bounds (line 98) | void BM_hypot_neon_bounds(benchmark::State& state) {
  function BM_pow_neon (line 104) | void BM_pow_neon(benchmark::State& state) {
  function BM_pow_neon_accurate (line 110) | void BM_pow_neon_accurate(benchmark::State& state) {
  function BM_pow_neon_scalar_exp (line 116) | void BM_pow_neon_scalar_exp(benchmark::State& state) {
  function BM_pow_libc_neon (line 122) | void BM_pow_libc_neon(benchmark::State& state) {
  function main (line 173) | int main() {

FILE: benchmarks/fast_math/sse_benchmarks.cpp
  type BoundsTraits (line 16) | struct BoundsTraits {
  function BM_sin_sse (line 23) | void BM_sin_sse(benchmark::State& state) {
  function BM_sin_sse_accurate (line 26) | void BM_sin_sse_accurate(benchmark::State& state) {
  function BM_cos_sse (line 31) | void BM_cos_sse(benchmark::State& state) {
  function BM_cos_sse_accurate (line 34) | void BM_cos_sse_accurate(benchmark::State& state) {
  function BM_tan_sse (line 39) | void BM_tan_sse(benchmark::State& state) {
  function BM_atan_sse (line 42) | void BM_atan_sse(benchmark::State& state) {
  function BM_acos_sse (line 45) | void BM_acos_sse(benchmark::State& state) {
  function BM_asin_sse (line 48) | void BM_asin_sse(benchmark::State& state) {
  function BM_exp_sse (line 52) | void BM_exp_sse(benchmark::State& state) {
  function BM_exp_sse_accurate (line 55) | void BM_exp_sse_accurate(benchmark::State& state) {
  function BM_exp_sse_bounds (line 60) | void BM_exp_sse_bounds(benchmark::State& state) {
  function BM_exp2_sse (line 64) | void BM_exp2_sse(benchmark::State& state) {
  function BM_exp10_sse (line 67) | void BM_exp10_sse(benchmark::State& state) {
  function BM_expm1_sse (line 70) | void BM_expm1_sse(benchmark::State& state) {
  function BM_log_sse (line 74) | void BM_log_sse(benchmark::State& state) {
  function BM_log_sse_accurate (line 77) | void BM_log_sse_accurate(benchmark::State& state) {
  function BM_log2_sse (line 82) | void BM_log2_sse(benchmark::State& state) {
  function BM_log10_sse (line 85) | void BM_log10_sse(benchmark::State& state) {
  function BM_log1p_sse (line 88) | void BM_log1p_sse(benchmark::State& state) {
  function BM_cbrt_sse (line 95) | void BM_cbrt_sse(benchmark::State& state) {
  function BM_cbrt_sse_accurate (line 98) | void BM_cbrt_sse_accurate(benchmark::State& state) {
  function BM_frexp_sse (line 104) | void BM_frexp_sse(benchmark::State& state) {
  function BM_ldexp_sse (line 110) | void BM_ldexp_sse(benchmark::State& state) {
  function BM_tanh_sse (line 115) | void BM_tanh_sse(benchmark::State& state) {
  function BM_erf_sse (line 118) | void BM_erf_sse(benchmark::State& state) {
  function BM_atan2_sse (line 124) | void BM_atan2_sse(benchmark::State& state) {
  function BM_hypot_sse (line 130) | void BM_hypot_sse(benchmark::State& state) {
  function BM_hypot_sse_bounds (line 135) | void BM_hypot_sse_bounds(benchmark::State& state) {
  function BM_pow_sse (line 141) | void BM_pow_sse(benchmark::State& state) {
  function BM_pow_sse_accurate (line 147) | void BM_pow_sse_accurate(benchmark::State& state) {
  function BM_pow_sse_scalar_exp (line 153) | void BM_pow_sse_scalar_exp(benchmark::State& state) {
  function BM_hypot_libc (line 159) | void BM_hypot_libc(benchmark::State& state) {
  function BM_pow_libc_sse (line 180) | void BM_pow_libc_sse(benchmark::State& state) {
  function main (line 239) | int main() {

FILE: benchmarks/for_each_benchmark.cpp
  function checkResults (line 48) | void checkResults(const std::vector<int>& input, const std::vector<int>&...
  function BM_serial (line 59) | void BM_serial(benchmark::State& state) {
  function BM_for_each_n (line 70) | void BM_for_each_n(benchmark::State& state) {
  function BM_for_each_n_deque (line 89) | void BM_for_each_n_deque(benchmark::State& state) {
  function BM_for_each_n_list (line 109) | void BM_for_each_n_list(benchmark::State& state) {
  function BM_for_each_n_set (line 129) | void BM_for_each_n_set(benchmark::State& state) {
  function CustomArguments (line 150) | static void CustomArguments(benchmark::internal::Benchmark* b) {
  function SmallArguments (line 159) | static void SmallArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/for_latency_benchmark.cpp
  function isPrime (line 38) | bool isPrime(int n) {
  function BM_serial (line 84) | void BM_serial(benchmark::State& state) {
  function BM_dispenso (line 103) | void BM_dispenso(benchmark::State& state) {
  function BM_omp (line 129) | void BM_omp(benchmark::State& state) {
  function BM_tbb (line 153) | void BM_tbb(benchmark::State& state) {
  function CustomArguments (line 180) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/future_benchmark.cpp
  type Node (line 30) | struct Node {
    method setValue (line 35) | void setValue(uint32_t unique_bitset, uint32_t modulo) {
  class Allocator (line 44) | class Allocator {
    method reset (line 46) | void reset(size_t depth) {
    method Node (line 51) | Node* alloc() {
  function sumTree (line 74) | uint64_t sumTree(Node* root) {
  function checkTree (line 81) | void checkTree(Node* root, uint32_t depth, uint32_t modulo) {
  function Node (line 100) | Node* serialTree(Allocator& allocator, uint32_t depth, uint32_t bitset, ...
    method setValue (line 35) | void setValue(uint32_t unique_bitset, uint32_t modulo) {
  function BM_serial_tree (line 116) | void BM_serial_tree(benchmark::State& state) {
  function Node (line 137) | Node* stdTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uin...
    method setValue (line 35) | void setValue(uint32_t unique_bitset, uint32_t modulo) {
  function BM_std_tree (line 155) | void BM_std_tree(benchmark::State& state) {
  function Node (line 192) | Node* dispensoTree(Allocator& allocator, uint32_t depth, uint32_t bitset...
    method setValue (line 35) | void setValue(uint32_t unique_bitset, uint32_t modulo) {
  function BM_dispenso_tree (line 213) | void BM_dispenso_tree(benchmark::State& state) {
  function follyTree (line 236) | folly::SemiFuture<folly::Unit> follyTree(
  function BM_folly_tree (line 267) | void BM_folly_tree(benchmark::State& state) {
  function dispensoTaskSetTree (line 288) | void dispensoTaskSetTree(
  function BM_dispenso_taskset_tree (line 319) | void BM_dispenso_taskset_tree(benchmark::State& state) {
  function dispensoTaskSetTreeBulk (line 343) | void dispensoTaskSetTreeBulk(
  function BM_dispenso_taskset_tree_bulk (line 374) | void BM_dispenso_taskset_tree_bulk(benchmark::State& state) {
  function dispensoTreeWhenAll (line 398) | dispenso::Future<Node*>
  function BM_dispenso_tree_when_all (line 424) | void BM_dispenso_tree_when_all(benchmark::State& state) {

FILE: benchmarks/graph_benchmark.cpp
  class TaskFlowBigTree (line 20) | class TaskFlowBigTree {
    method sizeOfLevel (line 22) | static size_t sizeOfLevel(size_t level) {
    method buildTree (line 25) | void buildTree() {
    method allocateMemory (line 31) | void allocateMemory() {
    method initData (line 42) | void initData() {
    method buildLevel (line 49) | void buildLevel(size_t level) {
    method testTree (line 70) | bool testTree() {
  class BigTree (line 93) | class BigTree {
    method sizeOfLevel (line 95) | static size_t sizeOfLevel(size_t level) {
    method buildTree (line 98) | void buildTree() {
    method allocateMemory (line 105) | void allocateMemory() {
    method initData (line 117) | void initData() {
    method buildLevel (line 124) | void buildLevel(size_t level) {
    method testTree (line 146) | bool testTree() {
  function BM_taskflow_build_big_tree (line 174) | static void BM_taskflow_build_big_tree(benchmark::State& state) {
  function BM_build_big_tree (line 188) | static void BM_build_big_tree(benchmark::State& state) {
  function BM_build_bi_prop_dependency_chain (line 198) | static void BM_build_bi_prop_dependency_chain(benchmark::State& state) {
  function BM_build_dependency_chain (line 212) | static void BM_build_dependency_chain(benchmark::State& state) {
  function BM_execute_dependency_chain (line 226) | static void BM_execute_dependency_chain(benchmark::State& state) {
  function BM_build_bi_prop_dependency_group (line 244) | static void BM_build_bi_prop_dependency_group(benchmark::State& state) {
  function BM_forward_propagator_node (line 262) | static void BM_forward_propagator_node(benchmark::State& state) {

FILE: benchmarks/graph_scene_benchmark.cpp
  type params (line 38) | namespace params {
  type Transform (line 52) | struct Transform {
  type Scene (line 62) | struct Scene {
  function branchlessONB (line 68) | void branchlessONB(const Vec3& n, Vec3& b1, Vec3& b2) {
  function Matrix4 (line 76) | Matrix4 getRandomTransformMatrix(std::mt19937& rng) {
  function Scene (line 94) | Scene generateTransformsHierarchy(
  function Vec3 (line 123) | Vec3 multiply(const Vec3& v, const Matrix4& m) {
  function Matrix4 (line 131) | Matrix4 multiply(const Matrix4& ma, const Matrix4& mb) {
  function calculateWorldMatrix (line 151) | void calculateWorldMatrix(std::vector<Transform>& transforms, size_t ind...
  function numGeoPoints (line 160) | size_t numGeoPoints(size_t inGeoIndex) {
  function Vec3 (line 164) | Vec3 calculateGeoPoint(size_t inGeoIndex, size_t pointIndex) {
  function generateGeoTF (line 175) | tf::Task generateGeoTF(tf::Taskflow& taskflow, std::vector<Geometry>& in...
  function transformGeoTF (line 184) | tf::Task
  function prepareGraphTF (line 194) | void prepareGraphTF(tf::Taskflow& taskflow, Scene& scene) {
  function generateGeo (line 235) | void generateGeo(
  function transformGeo (line 253) | void transformGeo(
  type Subgraphs (line 270) | struct Subgraphs {
  function Subgraphs (line 275) | Subgraphs prepareGraph(dispenso::ThreadPool& threadPool, Scene& scene, d...
  function compare (line 322) | bool compare(const std::array<float, N>& ma, const std::array<float, N>&...
  function testScene (line 330) | bool testScene(const Scene& scene) {
  function cleanScene (line 353) | void cleanScene(Scene& s) {
  function BM_scene_graph_parallel_for (line 368) | static void BM_scene_graph_parallel_for(benchmark::State& state) {
  function BM_scene_graph_concurrent_task_set (line 396) | static void BM_scene_graph_concurrent_task_set(benchmark::State& state) {
  function BM_scene_graph_partial_revaluation (line 424) | static void BM_scene_graph_partial_revaluation(benchmark::State& state) {
  function BM_scene_graph_taskflow (line 472) | static void BM_scene_graph_taskflow(benchmark::State& state) {

FILE: benchmarks/idle_pool_benchmark.cpp
  type Work (line 30) | struct alignas(64) Work {
  function testTid (line 40) | inline int testTid() {
  function Work (line 48) | inline Work& work() {
  function BM_tbb_mostly_idle (line 62) | void BM_tbb_mostly_idle(benchmark::State& state) {
  function BM_tbb_very_idle (line 96) | void BM_tbb_very_idle(benchmark::State& state) {
  function BM_dispenso_mostly_idle (line 114) | void BM_dispenso_mostly_idle(benchmark::State& state) {
  function BM_dispenso_very_idle (line 149) | void BM_dispenso_very_idle(benchmark::State& state) {
  function CustomArguments (line 165) | static void CustomArguments(benchmark::internal::Benchmark* b) {
  function CustomArgumentsVeryIdle (line 173) | static void CustomArgumentsVeryIdle(benchmark::internal::Benchmark* b) {

FILE: benchmarks/locality_benchmark.cpp
  function initArrays (line 53) | static void initArrays(std::vector<double>& input, std::vector<double>& ...
  function stencilPass (line 64) | inline void
  function checkOutput (line 74) | static void checkOutput(const double* data, size_t n) {
  function BM_serial (line 87) | void BM_serial(benchmark::State& state) {
  function BM_dispenso_static (line 100) | void BM_dispenso_static(benchmark::State& state) {
  function BM_dispenso_auto (line 125) | void BM_dispenso_auto(benchmark::State& state) {
  function BM_omp (line 151) | void BM_omp(benchmark::State& state) {
  function BM_tbb (line 178) | void BM_tbb(benchmark::State& state) {
  function CustomArguments (line 202) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/nested_for_benchmark.cpp
  function getInputs (line 33) | uint32_t getInputs(int numElements) {
  function calculate (line 38) | inline uint64_t calculate(uint64_t input, uint64_t index, size_t foo) {
  function calculateInnerSerial (line 44) | uint64_t calculateInnerSerial(uint64_t input, size_t foo, int numElement...
  function checkResults (line 52) | void checkResults(uint32_t input, uint64_t actual, int foo, size_t numEl...
  function BM_serial (line 70) | void BM_serial(benchmark::State& state) {
  function calculateInnerDispenso (line 84) | uint64_t calculateInnerDispenso(uint64_t input, size_t foo, int numEleme...
  function BM_dispenso (line 106) | void BM_dispenso(benchmark::State& state) {
  function calculateInnerDispensoAuto (line 141) | uint64_t calculateInnerDispensoAuto(uint64_t input, size_t foo, int numE...
  function BM_dispenso_auto (line 166) | void BM_dispenso_auto(benchmark::State& state) {
  function calculateInnerOmp (line 206) | uint64_t calculateInnerOmp(uint64_t input, size_t foo, int numElements) {
  function BM_omp (line 215) | void BM_omp(benchmark::State& state) {
  function calculateInnerTbb (line 239) | uint64_t calculateInnerTbb(uint64_t input, size_t foo, int numElements) {
  function BM_tbb (line 251) | void BM_tbb(benchmark::State& state) {
  function CustomArguments (line 277) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/nested_pool_benchmark.cpp
  type Work (line 37) | struct alignas(64) Work {
  function testTid (line 47) | inline int testTid() {
  function Work (line 55) | inline Work& work() {
  function BM_dispenso (line 68) | void BM_dispenso(benchmark::State& state) {
  function BM_dispenso_bulk (line 87) | void BM_dispenso_bulk(benchmark::State& state) {
  function BM_tbb (line 106) | void BM_tbb(benchmark::State& state) {
  function BM_folly (line 129) | void BM_folly(benchmark::State& state) {
  function CustomArguments (line 161) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/once_function_benchmark.cpp
  function runMoveLoop (line 24) | void runMoveLoop(benchmark::State& state, Func f) {
  class FuncConsumer (line 37) | class FuncConsumer {
    method add (line 39) | void add(Func&& f) {
    method consumeAll (line 43) | void consumeAll() {
  type Foo (line 56) | struct Foo {
    method Foo (line 57) | Foo() {
    method Foo (line 62) | Foo(Foo<kSize>&& f) {
    method Foo (line 66) | Foo(const Foo<kSize>& f) {
  function onceCall (line 78) | void onceCall(F&& f) {
  function BM_move_std_function (line 84) | void BM_move_std_function(benchmark::State& state) {
  function BM_move_once_function (line 89) | void BM_move_once_function(benchmark::State& state) {
  function BM_queue_inline_function (line 96) | void BM_queue_inline_function(benchmark::State& state) {
  function BM_queue_std_function (line 107) | void BM_queue_std_function(benchmark::State& state) {
  function BM_queue_once_function (line 118) | void BM_queue_once_function(benchmark::State& state) {

FILE: benchmarks/pipeline_benchmark.cpp
  type Work (line 36) | struct Work {
    method Work (line 37) | Work(size_t idx) : index(idx) {}
    method Work (line 39) | Work(Work&& w)
    method Work (line 42) | Work& operator=(Work&& w) {
  function Work (line 56) | Work fillImage(Work work) {
    method Work (line 37) | Work(size_t idx) : index(idx) {}
    method Work (line 39) | Work(Work&& w)
    method Work (line 42) | Work& operator=(Work&& w) {
  function Work (line 69) | Work computeGeometricMean(Work work) {
    method Work (line 37) | Work(size_t idx) : index(idx) {}
    method Work (line 39) | Work(Work&& w)
    method Work (line 42) | Work& operator=(Work&& w) {
  function tonemap (line 78) | std::unique_ptr<uint8_t[]> tonemap(Work work) {
  function runSerial (line 87) | void runSerial() {
  function checkResults (line 96) | void checkResults(const std::vector<std::unique_ptr<uint8_t[]>>& results) {
  function BM_serial (line 119) | void BM_serial(benchmark::State& state) {
  function runDispenso (line 125) | void runDispenso(std::vector<std::unique_ptr<uint8_t[]>>& results) {
  function BM_dispenso (line 144) | void BM_dispenso(benchmark::State& state) {
  function runDispensoPar (line 156) | void runDispensoPar(std::vector<std::unique_ptr<uint8_t[]>>& results) {
  function BM_dispenso_par (line 179) | void BM_dispenso_par(benchmark::State& state) {
  function runTBB (line 192) | void runTBB(std::vector<std::unique_ptr<uint8_t[]>>& results) {
  function BM_tbb (line 221) | void BM_tbb(benchmark::State& state) {
  function runTBBPar (line 231) | void runTBBPar(std::vector<std::unique_ptr<uint8_t[]>>& results) {
  function BM_tbb_par (line 260) | void BM_tbb_par(benchmark::State& state) {
  function runTaskflow (line 272) | void runTaskflow(std::vector<std::unique_ptr<uint8_t[]>>& results, tf::E...
  function BM_taskflow (line 307) | void BM_taskflow(benchmark::State& state) {
  function runTaskflowPar (line 318) | void runTaskflowPar(std::vector<std::unique_ptr<uint8_t[]>>& results, tf...
  function BM_taskflow_par (line 350) | void BM_taskflow_par(benchmark::State& state) {

FILE: benchmarks/pool_allocator_benchmark.cpp
  function run (line 21) | void run(benchmark::State& state, Alloc alloc, Free dealloc) {
  function runArena (line 34) | void runArena(benchmark::State& state, PoolAlloc& allocator) {
  function BM_mallocfree (line 45) | void BM_mallocfree(benchmark::State& state) {
  function BM_pool_allocator (line 53) | void BM_pool_allocator(benchmark::State& state) {
  function BM_nl_pool_allocator (line 62) | void BM_nl_pool_allocator(benchmark::State& state) {
  function BM_pool_allocator_arena (line 71) | void BM_pool_allocator_arena(benchmark::State& state) {
  function BM_nl_pool_allocator_arena (line 77) | void BM_nl_pool_allocator_arena(benchmark::State& state) {
  function runThreaded (line 83) | void runThreaded(benchmark::State& state, Alloc alloc, Free dealloc) {
  function BM_mallocfree_threaded (line 105) | void BM_mallocfree_threaded(benchmark::State& state) {
  function BM_pool_allocator_threaded (line 113) | void BM_pool_allocator_threaded(benchmark::State& state) {

FILE: benchmarks/run_benchmarks.py
  function get_machine_info (line 40) | def get_machine_info() -> Dict[str, Any]:
  function find_benchmarks (line 137) | def find_benchmarks(build_dir: Path, pattern: Optional[str] = None) -> L...
  function run_benchmark (line 163) | def run_benchmark(benchmark_path: Path, extra_args: List[str] = None) ->...
  function extract_benchmark_data (line 217) | def extract_benchmark_data(results: List[Dict]) -> pd.DataFrame:
  function generate_charts (line 248) | def generate_charts(df: pd.DataFrame, output_dir: Path):
  function generate_markdown_report (line 290) | def generate_markdown_report(
  function main (line 354) | def main():

FILE: benchmarks/rw_lock_benchmark.cpp
  function iterate (line 25) | int64_t iterate(MtxType& mtx, std::vector<int64_t>& values, int start, i...
  type NopMutex (line 41) | struct NopMutex {
    method lock (line 42) | void lock() {}
    method unlock (line 43) | void unlock() {}
    method lock_shared (line 44) | void lock_shared() {}
    method unlock_shared (line 45) | void unlock_shared() {}
  function BM_serial (line 49) | void BM_serial(benchmark::State& state) {
  function CustomArgumentsSerial (line 65) | static void CustomArgumentsSerial(benchmark::internal::Benchmark* b) {
  function BM_parallel (line 72) | void BM_parallel(benchmark::State& state) {
  function CustomArgumentsParallel (line 96) | static void CustomArgumentsParallel(benchmark::internal::Benchmark* b) {

FILE: benchmarks/simple_for_benchmark.cpp
  function BM_serial (line 55) | void BM_serial(benchmark::State& state) {
  function checkResults (line 66) | void checkResults(const std::vector<int>& input, const std::vector<int>&...
  function BM_dispenso (line 76) | void BM_dispenso(benchmark::State& state) {
  function BM_taskflow (line 99) | void BM_taskflow(benchmark::State& state) {
  function BM_dispenso_static_chunk (line 118) | void BM_dispenso_static_chunk(benchmark::State& state) {
  function BM_dispenso_auto_chunk (line 145) | void BM_dispenso_auto_chunk(benchmark::State& state) {
  function BM_omp (line 172) | void BM_omp(benchmark::State& state) {
  function BM_tbb (line 191) | void BM_tbb(benchmark::State& state) {
  function CustomArguments (line 213) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/simple_pool_benchmark.cpp
  type Work (line 31) | struct alignas(64) Work {
  function testTid (line 41) | inline int testTid() {
  function Work (line 49) | inline Work& work() {
  function BM_dispenso (line 62) | void BM_dispenso(benchmark::State& state) {
  function BM_dispenso_bulk (line 75) | void BM_dispenso_bulk(benchmark::State& state) {
  function BM_tbb (line 88) | void BM_tbb(benchmark::State& state) {
  function BM_folly (line 104) | void BM_folly(benchmark::State& state) {
  function CustomArguments (line 118) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/small_buffer_benchmark.cpp
  function run (line 17) | void run(benchmark::State& state, Alloc alloc, Free dealloc) {
  function BM_newdelete (line 30) | void BM_newdelete(benchmark::State& state) {
  function BM_small_buffer_allocator (line 35) | void BM_small_buffer_allocator(benchmark::State& state) {

FILE: benchmarks/summing_for_benchmark.cpp
  function checkResults (line 47) | void checkResults(const std::vector<int>& inputs, int64_t actual, int fo...
  function BM_serial (line 59) | void BM_serial(benchmark::State& state) {
  function BM_dispenso (line 73) | void BM_dispenso(benchmark::State& state) {
  function BM_omp (line 115) | void BM_omp(benchmark::State& state) {
  function BM_tbb (line 139) | void BM_tbb(benchmark::State& state) {
  function BM_dispenso_static (line 165) | void BM_dispenso_static(benchmark::State& state) {
  function CustomArguments (line 206) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: benchmarks/tbb_compat.h
  function namespace (line 41) | namespace tbb_compat {
  function namespace (line 69) | namespace tbb_compat {

FILE: benchmarks/thread_benchmark_common.h
  function std (line 20) | inline std::vector<int> pow2HalfStepThreads() {
  type rusage (line 35) | struct rusage
  function startRusage (line 37) | inline void startRusage() {
  function duration (line 43) | inline double duration(struct timeval start, struct timeval end) {
  function endRusage (line 47) | inline void endRusage(benchmark::State& state) {
  function startRusage (line 60) | inline void startRusage() {}
  function endRusage (line 61) | inline void endRusage(benchmark::State& state) {}
  function getMean (line 64) | inline double getMean(const std::vector<double>& data) {

FILE: benchmarks/timed_task_benchmark.cpp
  function getIterations (line 21) | size_t getIterations() {
  function absTimesToErrors (line 35) | void absTimesToErrors(std::vector<double>& times, double prevTime, doubl...
  function absTimesToSteadyErrors (line 45) | void absTimesToSteadyErrors(std::vector<double>& times, double prevTime,...
  function BM_folly (line 56) | void BM_folly(benchmark::State& state) {
  type FollyItem (line 97) | struct FollyItem {
    method FollyItem (line 103) | FollyItem() : times(getIterations()) {}
  function BM_folly_mixed (line 107) | void BM_folly_mixed(benchmark::State& state) {
  function BM_dispenso (line 183) | void BM_dispenso(benchmark::State& state) {
  type DispensoItem (line 230) | struct DispensoItem {
    method DispensoItem (line 235) | DispensoItem() : times(getIterations()) {}
  function BM_dispenso_mixed (line 239) | void BM_dispenso_mixed(benchmark::State& state) {

FILE: benchmarks/trivial_compute_benchmark.cpp
  function getInputs (line 28) | uint32_t getInputs(int num_elements) {
  function calculate (line 33) | inline uint64_t calculate(uint64_t input, uint64_t index, size_t foo) {
  function checkResults (line 39) | void checkResults(uint32_t input, uint64_t actual, int foo, size_t num_e...
  function BM_serial (line 57) | void BM_serial(benchmark::State& state) {
  function BM_dispenso (line 71) | void BM_dispenso(benchmark::State& state) {
  function BM_omp (line 115) | void BM_omp(benchmark::State& state) {
  function BM_tbb (line 139) | void BM_tbb(benchmark::State& state) {
  function BM_dispenso_static (line 165) | void BM_dispenso_static(benchmark::State& state) {
  function CustomArguments (line 206) | static void CustomArguments(benchmark::internal::Benchmark* b) {

FILE: dispenso/async_request.h
  function namespace (line 25) | namespace dispenso {

FILE: dispenso/completion_event.h
  function namespace (line 21) | namespace dispenso {

FILE: dispenso/concurrent_object_arena.h
  function namespace (line 24) | namespace detail {
  function namespace (line 36) | namespace dispenso {

FILE: dispenso/concurrent_vector.h
  function namespace (line 80) | namespace dispenso {
  function const_iterator (line 872) | const_iterator end() const {
  function const_iterator (line 897) | const_iterator cend() const {
  function reverse_iterator (line 922) | reverse_iterator rend() {
  function size_type (line 963) | constexpr size_type max_size() const noexcept {
  function swap (line 1014) | void swap(ConcurrentVector& oth) {
  function DISPENSO_INLINE (line 1047) | DISPENSO_INLINE cv::BucketInfo bucketAndSubIndex(size_t index) const {
  function internalFillN (line 1069) | void internalFillN(iterator it, size_t len, const T& value) {
  function internalFillDefaultN (line 1075) | void internalFillDefaultN(iterator it, size_t len) {
  function iterator (line 1081) | iterator growByUninitialized(size_type delta) {
  function iterator (line 1089) | iterator insertPartial(const_iterator pos) {
  function iterator (line 1098) | iterator insertPartial(const_iterator pos, size_t len) {
  function std (line 1133) | alignas(kCacheLineSize) std::atomic<size_t> size_{0}
  function DISPENSO_INLINE (line 1135) | DISPENSO_INLINE T* cachedBuffer(size_t bucket) const {
  function DISPENSO_INLINE (line 1148) | DISPENSO_INLINE void allocateBuffer(const cv::BucketInfo& binfo) {
  function DISPENSO_INLINE (line 1156) | DISPENSO_INLINE void
  function DISPENSO_INLINE (line 1165) | DISPENSO_INLINE void initCachedPtrs() {
  function DISPENSO_INLINE (line 1173) | DISPENSO_INLINE void setCachedPtr(size_t bucket, T* ptr) {
  function DISPENSO_INLINE (line 1182) | DISPENSO_INLINE void clearCachedPtr(size_t bucket) {

FILE: dispenso/detail/can_invoke.h
  function namespace (line 13) | namespace dispenso {

FILE: dispenso/detail/completion_event_impl.h
  function namespace (line 29) | namespace dispenso {

FILE: dispenso/detail/concurrent_vector_impl.h
  function namespace (line 10) | namespace cv {
  function detail (line 368) | const detail::AlignedAtomic<T>& operator[](size_t bucket) const {
  function allocAsNecessary (line 372) | void allocAsNecessary(const BucketInfo& binfo) {
  function allocAsNecessary (line 376) | void allocAsNecessary(const BucketInfo& binfo, T** cachedPtrs) {
  function allocAsNecessary (line 380) | void allocAsNecessary(const BucketInfo& binfo, ssize_t rangeLen, const B...
  function allocAsNecessary (line 384) | void allocAsNecessary(
  function shouldDealloc (line 393) | bool shouldDealloc(size_t bucket) const {
  function DISPENSO_INLINE (line 400) | DISPENSO_INLINE static size_t allocCheckIndex(size_t bucketCapacity) {

FILE: dispenso/detail/concurrent_vector_impl2.h
  function namespace (line 10) | namespace cv {

FILE: dispenso/detail/epoch_waiter.h
  function namespace (line 14) | namespace dispenso {

FILE: dispenso/detail/future_impl.h
  function namespace (line 14) | namespace dispenso {
  function move (line 529) | void move(FutureBase&& f) noexcept {
  function copy (line 538) | void copy(const FutureBase& f) {
  function future_status (line 566) | future_status wait_for(const std::chrono::duration<Rep, Period>& timeout...
  function future_status (line 571) | future_status wait_until(const std::chrono::time_point<Clock, Duration>&...
  function assertValid (line 588) | void assertValid() const {
  type ReadyTag (line 598) | struct ReadyTag {}

FILE: dispenso/detail/future_impl2.h
  function namespace (line 8) | namespace dispenso {
  function schedule (line 133) | struct InterceptionInvoker {
  function whenComplete (line 153) | auto whenComplete = [shared]() -> TupleType {
  function whenComplete (line 196) | auto whenComplete = [shared]() -> VecType {
  function tuple (line 250) | inline auto when_all(TaskSet&) -> Future<std::tuple<>> {
  function tuple (line 254) | inline auto when_all(ConcurrentTaskSet&) -> Future<std::tuple<>> {

FILE: dispenso/detail/graph_executor_impl.h
  function namespace (line 15) | namespace detail {
  function appendGroup (line 105) | static void appendGroup(
  function appendGroup (line 109) | static void appendGroup(

FILE: dispenso/detail/math.h
  function namespace (line 17) | namespace dispenso {
  function log2 (line 124) | inline uint32_t log2(uint32_t v) {
  function log2 (line 130) | inline uint32_t log2(uint32_t v) {
  function log2 (line 134) | inline uint32_t log2(uint32_t v) {
  function log2 (line 140) | inline uint32_t log2(uint32_t v) {

FILE: dispenso/detail/notifier_common.h
  function namespace (line 20) | namespace dispenso {
  function namespace (line 72) | namespace dispenso {
  function namespace (line 141) | namespace dispenso {

FILE: dispenso/detail/once_callable_impl.h
  function namespace (line 12) | namespace dispenso {

FILE: dispenso/detail/op_result.h
  function namespace (line 13) | namespace dispenso {

FILE: dispenso/detail/per_thread_info.cpp
  type dispenso (line 10) | namespace dispenso {
    type detail (line 11) | namespace detail {
      function PerThreadInfo (line 16) | PerThreadInfo& PerPoolPerThreadInfo::info() {

FILE: dispenso/detail/per_thread_info.h
  function namespace (line 12) | namespace dispenso {

FILE: dispenso/detail/pipeline_impl.h
  function namespace (line 20) | namespace dispenso {
  function wait (line 476) | void wait() {

FILE: dispenso/detail/quanta.cpp
  type dispenso (line 15) | namespace dispenso {
    type OsQuantaSetter (line 19) | struct OsQuantaSetter {
      method OsQuantaSetter (line 20) | OsQuantaSetter() {
    type OsQuantaSetter (line 30) | struct OsQuantaSetter {}
      method OsQuantaSetter (line 20) | OsQuantaSetter() {
    type detail (line 35) | namespace detail {
      function registerFineSchedulerQuanta (line 36) | void registerFineSchedulerQuanta() {

FILE: dispenso/detail/quanta.h
  function namespace (line 10) | namespace dispenso {

FILE: dispenso/detail/result_of.h
  function namespace (line 12) | namespace dispenso {

FILE: dispenso/detail/rw_lock_impl.h
  function namespace (line 10) | namespace dispenso {

FILE: dispenso/detail/small_buffer_allocator_impl.h
  function namespace (line 18) | namespace dispenso {
  function dealloc (line 86) | static void dealloc(char* buffer) {
  function bytesAllocated (line 107) | static size_t bytesAllocated() {
  type PerThreadQueuingData (line 119) | struct PerThreadQueuingData {
  function enqueue_bulk (line 134) | void enqueue_bulk(char** buffers, size_t count) {
  function try_dequeue_bulk (line 140) | size_t try_dequeue_bulk(char** buffers, size_t count) {
  function registerCleanup (line 162) | static void registerCleanup() {
  function grabFromCentralStore (line 171) | static size_t grabFromCentralStore(char** buffers) {
  function recycleToCentralStore (line 205) | static void recycleToCentralStore(char** buffers, size_t numToRecycle) {
  function DISPENSO_DLL_ACCESS (line 217) | DISPENSO_DLL_ACCESS static PerThreadQueuingData& getThreadQueuingData() {

FILE: dispenso/detail/task_set_impl.h
  function namespace (line 22) | namespace detail {
  function class (line 35) | class TaskSetBase {
  function registerChild (line 214) | void registerChild(TaskSetBase* child) {
  function unregisterChild (line 227) | void unregisterChild(TaskSetBase* child) {
  function cancelChildren (line 246) | void cancelChildren() {
  function std (line 256) | alignas(kCacheLineSize) std::atomic<ssize_t> outstandingTaskCount_{0}
  type ExceptionState (line 261) | enum ExceptionState { kUnset, kSetting, kSet }
  function TaskSetBase (line 269) | TaskSetBase* head_{nullptr};

FILE: dispenso/detail/timed_task_impl.h
  function namespace (line 10) | namespace dispenso {

FILE: dispenso/fast_math/detail/double_promote.h
  function namespace (line 30) | namespace dispenso {
  type DoubleVec (line 377) | struct DoubleVec
  function DISPENSO_INLINE (line 389) | static DISPENSO_INLINE DoubleVec gather(const double* base, NeonInt32 id...
  function DISPENSO_INLINE (line 399) | DISPENSO_INLINE NeonFloat to_float() const {
  function DoubleVec (line 420) | DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) {
  function HwyFloat (line 467) | struct DoubleVec<HwyFloat> {

FILE: dispenso/fast_math/detail/fast_math_impl.h
  function namespace (line 13) | namespace dispenso {

FILE: dispenso/fast_math/fast_math.h
  function namespace (line 18) | namespace dispenso {
  function namespace (line 43) | namespace fast_math {
  function Flt (line 694) | Flt frexp(Flt x, IntType_t<Flt>* eptr) {
  function Flt (line 717) | Flt ldexp(Flt x, IntType_t<Flt> e) {
  function Flt (line 737) | Flt acos(Flt x) {
  function Flt (line 794) | Flt asin(Flt x) {
  function Flt (line 1178) | Flt exp(Flt x) {

FILE: dispenso/fast_math/float_traits.h
  function namespace (line 22) | namespace dispenso {

FILE: dispenso/fast_math/float_traits_avx.h
  function namespace (line 19) | namespace dispenso {
  type AvxInt32 (line 114) | struct AvxInt32 {
  type AvxUint32 (line 215) | struct AvxUint32 {
  function AvxUint32 (line 251) | AvxUint32 operator<<(int n) const {
  function AvxUint32 (line 254) | AvxUint32 operator>>(int n) const {
  function AvxFloat (line 313) | inline AvxFloat::AvxFloat(AvxInt32 i) : v(_mm256_cvtepi32_ps(i.v)) {}
  function AvxInt32 (line 316) | inline AvxInt32::AvxInt32(AvxUint32 u) : v(u.v) {}
  function __m256 (line 320) | struct SimdTypeFor<__m256> {
  function AvxFloat (line 354) | struct FloatTraits<AvxFloat> {
  function AvxFloat (line 400) | inline AvxFloat FloatTraits<AvxFloat>::conditional(AvxFloat mask, AvxFlo...
  function AvxInt32 (line 404) | inline AvxInt32 FloatTraits<AvxFloat>::conditional(AvxFloat mask, AvxInt...
  function AvxUint32 (line 409) | inline AvxUint32 FloatTraits<AvxFloat>::conditional(AvxFloat mask, AvxUi...
  function AvxFloat (line 415) | inline AvxFloat FloatTraits<AvxFloat>::conditional(AvxInt32 mask, AvxFlo...
  function AvxInt32 (line 419) | inline AvxInt32 FloatTraits<AvxFloat>::conditional(AvxInt32 mask, AvxInt...
  function AvxUint32 (line 424) | inline AvxUint32 FloatTraits<AvxFloat>::conditional(AvxInt32 mask, AvxUi...
  function AvxFloat (line 431) | inline AvxFloat FloatTraits<AvxFloat>::apply(AvxFloat mask, AvxFloat x) {
  function AvxInt32 (line 435) | inline AvxInt32 FloatTraits<AvxFloat>::apply(AvxFloat mask, AvxInt32 x) {
  function AvxUint32 (line 439) | inline AvxUint32 FloatTraits<AvxFloat>::apply(AvxFloat mask, AvxUint32 x) {
  function AvxInt32 (line 444) | struct FloatTraits<AvxInt32> {
  function AvxUint32 (line 449) | struct FloatTraits<AvxUint32> {
  function DISPENSO_INLINE (line 455) | DISPENSO_INLINE AvxFloat floor_small(AvxFloat x) {
  function DISPENSO_INLINE (line 459) | DISPENSO_INLINE AvxInt32 convert_to_int_trunc(AvxFloat f) {
  function DISPENSO_INLINE (line 463) | DISPENSO_INLINE AvxInt32 convert_to_int_trunc_safe(AvxFloat f) {
  function DISPENSO_INLINE (line 469) | DISPENSO_INLINE AvxInt32 convert_to_int(AvxFloat f) {
  function DISPENSO_INLINE (line 498) | DISPENSO_INLINE AvxInt32 int_div_by_3(AvxInt32 i) {
  function DISPENSO_INLINE (line 513) | DISPENSO_INLINE AvxInt32 nonnormal(AvxInt32 i) {
  function DISPENSO_INLINE (line 517) | DISPENSO_INLINE AvxInt32 nonnormalOrZero(AvxInt32 i) {
  function DISPENSO_INLINE (line 522) | DISPENSO_INLINE AvxInt32 nonnormal(AvxFloat f) {
  function DISPENSO_INLINE (line 527) | DISPENSO_INLINE bool any_true(AvxInt32 mask) {
  function DISPENSO_INLINE (line 531) | DISPENSO_INLINE AvxFloat signof(AvxFloat x) {
  function DISPENSO_INLINE (line 536) | DISPENSO_INLINE AvxInt32 signofi(AvxInt32 i) {

FILE: dispenso/fast_math/float_traits_avx512.h
  function namespace (line 19) | namespace dispenso {
  function operator (line 73) | inline operator Avx512Int32() const;
  type Avx512Int32 (line 166) | struct Avx512Int32 {
  type Avx512Uint32 (line 267) | struct Avx512Uint32 {
  function Avx512Uint32 (line 303) | Avx512Uint32 operator<<(int n) const {
  function Avx512Uint32 (line 306) | Avx512Uint32 operator>>(int n) const {
  function Avx512Float (line 371) | inline Avx512Float::Avx512Float(Avx512Int32 i) : v(_mm512_cvtepi32_ps(i....
  function Avx512Int32 (line 374) | inline Avx512Int32::Avx512Int32(Avx512Uint32 u) : v(u.v) {}
  function __m512 (line 378) | struct SimdTypeFor<__m512> {
  function Avx512Float (line 412) | struct FloatTraits<Avx512Float> {
  function Avx512Float (line 458) | inline Avx512Float
  function Avx512Int32 (line 463) | inline Avx512Int32
  function Avx512Uint32 (line 468) | inline Avx512Uint32
  function Avx512Float (line 477) | inline Avx512Float
  function Avx512Int32 (line 483) | inline Avx512Int32
  function Avx512Uint32 (line 489) | inline Avx512Uint32
  function Avx512Float (line 497) | inline Avx512Float FloatTraits<Avx512Float>::apply(Avx512Mask mask, Avx5...
  function Avx512Int32 (line 501) | inline Avx512Int32 FloatTraits<Avx512Float>::apply(Avx512Mask mask, Avx5...
  function Avx512Uint32 (line 505) | inline Avx512Uint32 FloatTraits<Avx512Float>::apply(Avx512Mask mask, Avx...
  function Avx512Int32 (line 510) | struct FloatTraits<Avx512Int32> {
  function Avx512Uint32 (line 515) | struct FloatTraits<Avx512Uint32> {
  function DISPENSO_INLINE (line 521) | DISPENSO_INLINE Avx512Float floor_small(Avx512Float x) {
  function DISPENSO_INLINE (line 525) | DISPENSO_INLINE Avx512Int32 convert_to_int_trunc(Avx512Float f) {
  function DISPENSO_INLINE (line 529) | DISPENSO_INLINE Avx512Int32 convert_to_int_trunc_safe(Avx512Float f) {
  function DISPENSO_INLINE (line 536) | DISPENSO_INLINE Avx512Int32 convert_to_int(Avx512Float f) {
  function DISPENSO_INLINE (line 569) | DISPENSO_INLINE Avx512Int32 int_div_by_3(Avx512Int32 i) {
  function DISPENSO_INLINE (line 584) | DISPENSO_INLINE Avx512Mask nonnormal(Avx512Int32 i) {
  function DISPENSO_INLINE (line 589) | DISPENSO_INLINE Avx512Mask nonnormalOrZero(Avx512Int32 i) {
  function DISPENSO_INLINE (line 596) | DISPENSO_INLINE Avx512Mask nonnormal(Avx512Float f) {
  function DISPENSO_INLINE (line 601) | DISPENSO_INLINE bool any_true(Avx512Mask mask) {
  function DISPENSO_INLINE (line 605) | DISPENSO_INLINE Avx512Float signof(Avx512Float x) {
  function DISPENSO_INLINE (line 610) | DISPENSO_INLINE Avx512Int32 signofi(Avx512Int32 i) {

FILE: dispenso/fast_math/float_traits_hwy.h
  function namespace (line 21) | namespace dispenso {
  type HwyInt32 (line 133) | struct HwyInt32 {
  type HwyUint32 (line 238) | struct HwyUint32 {
  function HwyFloat (line 337) | inline HwyFloat::HwyFloat(HwyInt32 i) : v(hn::ConvertTo(HwyFloatTag{}, i...
  function HwyInt32 (line 340) | inline HwyInt32::HwyInt32(HwyUint32 u) : v(hn::BitCast(HwyInt32Tag{}, u....
  function HwyFloatTag (line 344) | struct SimdTypeFor<hn::Vec<HwyFloatTag>> {
  function HwyFloat (line 378) | struct FloatTraits<HwyFloat> {
  function HwyFloat (line 422) | inline HwyFloat FloatTraits<HwyFloat>::conditional(HwyFloat mask, HwyFlo...
  function HwyInt32 (line 427) | inline HwyInt32 FloatTraits<HwyFloat>::conditional(HwyFloat mask, HwyInt...
  function HwyUint32 (line 434) | inline HwyUint32 FloatTraits<HwyFloat>::conditional(HwyFloat mask, HwyUi...
  function HwyFloat (line 443) | inline HwyFloat FloatTraits<HwyFloat>::conditional(HwyInt32 mask, HwyFlo...
  function HwyInt32 (line 450) | inline HwyInt32 FloatTraits<HwyFloat>::conditional(HwyInt32 mask, HwyInt...
  function HwyUint32 (line 455) | inline HwyUint32 FloatTraits<HwyFloat>::conditional(HwyInt32 mask, HwyUi...
  function HwyFloat (line 464) | inline HwyFloat FloatTraits<HwyFloat>::apply(HwyFloat mask, HwyFloat x) {
  function HwyInt32 (line 470) | inline HwyInt32 FloatTraits<HwyFloat>::apply(HwyFloat mask, HwyInt32 x) {
  function HwyUint32 (line 475) | inline HwyUint32 FloatTraits<HwyFloat>::apply(HwyFloat mask, HwyUint32 x) {
  function HwyInt32 (line 481) | struct FloatTraits<HwyInt32> {
  function HwyUint32 (line 486) | struct FloatTraits<HwyUint32> {
  function DISPENSO_INLINE (line 492) | DISPENSO_INLINE HwyFloat floor_small(HwyFloat x) {
  function DISPENSO_INLINE (line 496) | DISPENSO_INLINE HwyInt32 convert_to_int_trunc(HwyFloat f) {
  function DISPENSO_INLINE (line 502) | DISPENSO_INLINE HwyInt32 convert_to_int_trunc_safe(HwyFloat f) {
  function DISPENSO_INLINE (line 510) | DISPENSO_INLINE HwyInt32 convert_to_int(HwyFloat f) {
  function DISPENSO_INLINE (line 556) | DISPENSO_INLINE HwyInt32 int_div_by_3(HwyInt32 i) {

FILE: dispenso/fast_math/float_traits_neon.h
  function namespace (line 19) | namespace dispenso {
  type NeonInt32 (line 110) | struct NeonInt32 {
  function NeonInt32 (line 145) | NeonInt32 operator<<(int n) const {
  function NeonInt32 (line 148) | NeonInt32 operator>>(int n) const {
  type NeonUint32 (line 206) | struct NeonUint32 {
  function NeonUint32 (line 237) | NeonUint32 operator<<(int n) const {
  function NeonUint32 (line 240) | NeonUint32 operator>>(int n) const {
  function NeonFloat (line 300) | inline NeonFloat::NeonFloat(NeonInt32 i) : v(vcvtq_f32_s32(i.v)) {}
  function NeonInt32 (line 303) | inline NeonInt32::NeonInt32(NeonUint32 u) : v(vreinterpretq_s32_u32(u.v)...
  function float32x4_t (line 307) | struct SimdTypeFor<float32x4_t> {
  function NeonFloat (line 341) | struct FloatTraits<NeonFloat> {
  function NeonFloat (line 385) | inline NeonFloat FloatTraits<NeonFloat>::conditional(NeonFloat mask, Neo...
  function NeonInt32 (line 389) | inline NeonInt32 FloatTraits<NeonFloat>::conditional(NeonFloat mask, Neo...
  function NeonUint32 (line 394) | inline NeonUint32 FloatTraits<NeonFloat>::conditional(NeonFloat mask, Ne...
  function NeonFloat (line 400) | inline NeonFloat FloatTraits<NeonFloat>::conditional(NeonInt32 mask, Neo...
  function NeonInt32 (line 404) | inline NeonInt32 FloatTraits<NeonFloat>::conditional(NeonInt32 mask, Neo...
  function NeonUint32 (line 409) | inline NeonUint32 FloatTraits<NeonFloat>::conditional(NeonInt32 mask, Ne...
  function NeonFloat (line 415) | inline NeonFloat FloatTraits<NeonFloat>::apply(NeonFloat mask, NeonFloat...
  function NeonInt32 (line 420) | inline NeonInt32 FloatTraits<NeonFloat>::apply(NeonFloat mask, NeonInt32...
  function NeonUint32 (line 425) | inline NeonUint32 FloatTraits<NeonFloat>::apply(NeonFloat mask, NeonUint...
  function NeonInt32 (line 430) | struct FloatTraits<NeonInt32> {
  function NeonUint32 (line 435) | struct FloatTraits<NeonUint32> {
  function DISPENSO_INLINE (line 441) | DISPENSO_INLINE NeonFloat floor_small(NeonFloat x) {
  function DISPENSO_INLINE (line 445) | DISPENSO_INLINE NeonInt32 convert_to_int_trunc(NeonFloat f) {
  function DISPENSO_INLINE (line 449) | DISPENSO_INLINE NeonInt32 convert_to_int_trunc_safe(NeonFloat f) {
  function DISPENSO_INLINE (line 455) | DISPENSO_INLINE NeonInt32 convert_to_int(NeonFloat f) {
  function DISPENSO_INLINE (line 490) | DISPENSO_INLINE NeonInt32 int_div_by_3(NeonInt32 i) {
  function DISPENSO_INLINE (line 505) | DISPENSO_INLINE NeonInt32 nonnormal(NeonInt32 i) {
  function DISPENSO_INLINE (line 509) | DISPENSO_INLINE NeonInt32 nonnormalOrZero(NeonInt32 i) {
  function DISPENSO_INLINE (line 514) | DISPENSO_INLINE NeonInt32 nonnormal(NeonFloat f) {
  function DISPENSO_INLINE (line 519) | DISPENSO_INLINE bool any_true(NeonInt32 mask) {
  function DISPENSO_INLINE (line 523) | DISPENSO_INLINE NeonFloat signof(NeonFloat x) {
  function DISPENSO_INLINE (line 528) | DISPENSO_INLINE NeonInt32 signofi(NeonInt32 i) {

FILE: dispenso/fast_math/float_traits_x86.h
  function namespace (line 19) | namespace dispenso {
  type SseInt32 (line 114) | struct SseInt32 {
  type SseUint32 (line 215) | struct SseUint32 {
  function SseUint32 (line 251) | SseUint32 operator<<(int n) const {
  function SseUint32 (line 254) | SseUint32 operator>>(int n) const {
  function SseFloat (line 313) | inline SseFloat::SseFloat(SseInt32 i) : v(_mm_cvtepi32_ps(i.v)) {}
  function SseInt32 (line 316) | inline SseInt32::SseInt32(SseUint32 u) : v(u.v) {}
  function __m128 (line 320) | struct SimdTypeFor<__m128> {
  function SseFloat (line 354) | struct FloatTraits<SseFloat> {
  function SseFloat (line 400) | inline SseFloat FloatTraits<SseFloat>::conditional(SseFloat mask, SseFlo...
  function SseInt32 (line 404) | inline SseInt32 FloatTraits<SseFloat>::conditional(SseFloat mask, SseInt...
  function SseUint32 (line 408) | inline SseUint32 FloatTraits<SseFloat>::conditional(SseFloat mask, SseUi...
  function SseFloat (line 413) | inline SseFloat FloatTraits<SseFloat>::conditional(SseInt32 mask, SseFlo...
  function SseInt32 (line 417) | inline SseInt32 FloatTraits<SseFloat>::conditional(SseInt32 mask, SseInt...
  function SseUint32 (line 422) | inline SseUint32 FloatTraits<SseFloat>::conditional(SseInt32 mask, SseUi...
  function SseFloat (line 429) | inline SseFloat FloatTraits<SseFloat>::apply(SseFloat mask, SseFloat x) {
  function SseInt32 (line 433) | inline SseInt32 FloatTraits<SseFloat>::apply(SseFloat mask, SseInt32 x) {
  function SseUint32 (line 437) | inline SseUint32 FloatTraits<SseFloat>::apply(SseFloat mask, SseUint32 x) {
  function SseInt32 (line 442) | struct FloatTraits<SseInt32> {
  function SseUint32 (line 447) | struct FloatTraits<SseUint32> {
  function DISPENSO_INLINE (line 453) | DISPENSO_INLINE SseFloat floor_small(SseFloat x) {
  function DISPENSO_INLINE (line 457) | DISPENSO_INLINE SseInt32 convert_to_int_trunc(SseFloat f) {
  function DISPENSO_INLINE (line 461) | DISPENSO_INLINE SseInt32 convert_to_int_trunc_safe(SseFloat f) {
  function DISPENSO_INLINE (line 467) | DISPENSO_INLINE SseInt32 convert_to_int(SseFloat f) {
  function DISPENSO_INLINE (line 498) | DISPENSO_INLINE SseInt32 int_div_by_3(SseInt32 i) {
  function DISPENSO_INLINE (line 513) | DISPENSO_INLINE SseInt32 nonnormal(SseInt32 i) {
  function DISPENSO_INLINE (line 517) | DISPENSO_INLINE SseInt32 nonnormalOrZero(SseInt32 i) {
  function DISPENSO_INLINE (line 522) | DISPENSO_INLINE SseInt32 nonnormal(SseFloat f) {
  function DISPENSO_INLINE (line 527) | DISPENSO_INLINE bool any_true(SseInt32 mask) {
  function DISPENSO_INLINE (line 531) | DISPENSO_INLINE SseFloat signof(SseFloat x) {
  function DISPENSO_INLINE (line 536) | DISPENSO_INLINE SseInt32 signofi(SseInt32 i) {

FILE: dispenso/fast_math/util.h
  function namespace (line 22) | namespace dispenso {
  function Flt (line 378) | Flt hornerImpl(Flt, Flt accum) {
  function Flt (line 383) | Flt hornerImpl(Flt x, Flt accum, Flt next, Cs... rest) {
  function DISPENSO_INLINE (line 404) | DISPENSO_INLINE static Flt reduce(Flt, std::tuple<Flt> done) {
  function DISPENSO_INLINE (line 408) | DISPENSO_INLINE static Flt reduce(Flt xp, std::tuple<Flt, Flt> done) {
  function Flt (line 414) | static Flt reduce(Flt xp, std::tuple<Done...> done) {
  function Flt (line 420) | static Flt pairLevel(Flt xp, std::tuple<Paired...> paired) {
  function Flt (line 426) | static Flt pairLevel(Flt xp, std::tuple<Paired...> paired, Flt a) {
  function Flt (line 432) | static Flt
  function Flt (line 445) | Flt hornerEval(Flt x, C0 cn, Cs... rest) {
  function Flt (line 453) | Flt estrinEval(Flt x, C0 cn, Cs... rest) {
  function Flt (line 465) | Flt polyEval(Flt x, C0 cn, Cs... rest) {
  function namespace (line 497) | namespace dispenso {

FILE: dispenso/for_each.h
  function namespace (line 25) | namespace dispenso {

FILE: dispenso/future.h
  function namespace (line 26) | namespace dispenso {
  function Base (line 285) | Future(const Future& f) noexcept : Base(f) {}
  function Base (line 287) | Future(const Base& f) noexcept : Base(f) {}
  function Future (line 312) | Future share() {
  function Base (line 370) | Future(const Future& f) noexcept : Base(f) {}
  function Base (line 372) | Future(const Base& f) noexcept : Base(f) {}
  function Future (line 398) | Future share() {

FILE: dispenso/graph.cpp
  function set_union (line 16) | void set_union(
  function set_insert (line 24) | void set_insert(std::vector<const dispenso::BiPropNode*>& s, const dispe...
  type dispenso (line 32) | namespace dispenso {
    class DISPENSO_DLL_ACCESS (line 208) | class DISPENSO_DLL_ACCESS
    class DISPENSO_DLL_ACCESS (line 209) | class DISPENSO_DLL_ACCESS
    class DISPENSO_DLL_ACCESS (line 210) | class DISPENSO_DLL_ACCESS
    class DISPENSO_DLL_ACCESS (line 211) | class DISPENSO_DLL_ACCESS

FILE: dispenso/graph.h
  function namespace (line 226) | namespace detail {
  function namespace (line 243) | namespace dispenso {
  function clear (line 708) | inline void clear() {
  function clearSubgraphs (line 715) | inline void clearSubgraphs() {

FILE: dispenso/graph_executor.cpp
  type dispenso (line 10) | namespace dispenso {
    function setAllNodesIncomplete (line 130) | void setAllNodesIncomplete(const G& graph) {

FILE: dispenso/graph_executor.h
  function namespace (line 21) | namespace dispenso {

FILE: dispenso/latch.h
  function namespace (line 21) | namespace dispenso {

FILE: dispenso/once_function.h
  function namespace (line 22) | namespace dispenso {

FILE: dispenso/parallel_for.h
  function namespace (line 24) | namespace dispenso {
  function namespace (line 236) | namespace detail {
  type NoOpContainer (line 291) | struct NoOpContainer {
  function clear (line 300) | void clear() {}
  function NoOpIter (line 302) | NoOpIter begin() {
  function emplace_back (line 306) | void emplace_back(int) {}
  function front (line 308) | int& front() {
  type NoOpStateGen (line 314) | struct NoOpStateGen {
  function std (line 648) | alignas(kCacheLineSize) std::atomic<decltype(numChunks)> index(0);

FILE: dispenso/pipeline.h
  function namespace (line 20) | namespace dispenso {

FILE: dispenso/platform.h
  function namespace (line 28) | namespace dispenso {

FILE: dispenso/pool_allocator.cpp
  type dispenso (line 10) | namespace dispenso {
    class PoolAllocatorT<false> (line 113) | class PoolAllocatorT<false>
    class PoolAllocatorT<true> (line 114) | class PoolAllocatorT<true>

FILE: dispenso/pool_allocator.h
  function namespace (line 24) | namespace dispenso {

FILE: dispenso/priority.cpp
  type dispenso (line 29) | namespace dispenso {
    function ThreadPriority (line 35) | ThreadPriority getCurrentThreadPriority() {
    function setCurrentThreadPriority (line 40) | bool setCurrentThreadPriority(ThreadPriority prio) {
    function setCurrentThreadPriority (line 90) | bool setCurrentThreadPriority(ThreadPriority prio) {
    function setCurrentThreadPriority (line 124) | bool setCurrentThreadPriority(ThreadPriority prio) {
    function setCurrentThreadPriority (line 163) | bool setCurrentThreadPriority(ThreadPriority prio) {
    function setCurrentThreadPriority (line 194) | bool setCurrentThreadPriority(ThreadPriority prio) {

FILE: dispenso/priority.h
  function namespace (line 25) | namespace dispenso {

FILE: dispenso/resource_pool.h
  function namespace (line 21) | namespace dispenso {

FILE: dispenso/rw_lock.h
  function namespace (line 16) | namespace dispenso {

FILE: dispenso/schedulable.h
  function namespace (line 20) | namespace dispenso {
  function class (line 62) | class NewThreadInvoker {

FILE: dispenso/small_buffer_allocator.cpp
  type dispenso (line 13) | namespace dispenso {
    type detail (line 14) | namespace detail {
      function SmallBufferGlobals (line 17) | SmallBufferGlobals& getSmallBufferGlobals() {
      function deallocSmallBufferImpl (line 45) | void deallocSmallBufferImpl(size_t ordinal, void* buf) {
      function approxBytesAllocatedSmallBufferImpl (line 73) | size_t approxBytesAllocatedSmallBufferImpl(size_t ordinal) {
      class SmallBufferAllocator<4> (line 105) | class SmallBufferAllocator<4>
      class SmallBufferAllocator<8> (line 106) | class SmallBufferAllocator<8>
      class SmallBufferAllocator<16> (line 107) | class SmallBufferAllocator<16>
      class SmallBufferAllocator<32> (line 108) | class SmallBufferAllocator<32>
      class SmallBufferAllocator<64> (line 109) | class SmallBufferAllocator<64>
      class SmallBufferAllocator<128> (line 110) | class SmallBufferAllocator<128>
      class SmallBufferAllocator<256> (line 111) | class SmallBufferAllocator<256>

FILE: dispenso/small_buffer_allocator.h
  function namespace (line 20) | namespace dispenso {

FILE: dispenso/small_vector.h
  function namespace (line 30) | namespace dispenso {
  function size_ (line 100) | SmallVector(const SmallVector& other) : size_(0) {
  function reference (line 165) | reference operator[](size_type pos) {
  function const_reference (line 169) | const_reference operator[](size_type pos) const {
  function reference (line 173) | reference front() {
  function reference (line 181) | reference back() {
  function pointer (line 190) | pointer data() noexcept {
  function iterator (line 201) | iterator begin() noexcept {
  function const_iterator (line 205) | const_iterator begin() const noexcept {
  function iterator (line 213) | iterator end() noexcept {
  function size_type (line 232) | size_type size() const noexcept {
  function reserve (line 249) | void reserve(size_type newCap) {
  function clear (line 259) | void clear() noexcept {
  function push_back (line 265) | void push_back(const T& value) {
  function push_back (line 269) | void push_back(T&& value) {
  function pop_back (line 306) | void pop_back() {
  function resize (line 319) | void resize(size_type count) {
  function resize (line 344) | void resize(size_type count, const T& value) {
  function iterator (line 370) | iterator erase(const_iterator pos) {
  function setSize (line 400) | void setSize(size_type s) noexcept {
  function T (line 405) | T* inlineData() noexcept {
  function T (line 408) | const T* inlineData() const noexcept {
  function destroyAll (line 412) | void destroyAll() noexcept {
  function growToHeap (line 425) | void growToHeap(size_type newCap) {
  function ensureCapacity (line 444) | void ensureCapacity(size_type newCap) {
  type HeapStorage (line 457) | struct HeapStorage {

FILE: dispenso/spsc_ring_buffer.h
  function namespace (line 32) | namespace dispenso {

FILE: dispenso/task_set.cpp
  type dispenso (line 12) | namespace dispenso {
    type detail (line 14) | namespace detail {
      function pushThreadTaskSet (line 21) | void pushThreadTaskSet(TaskSetBase* t) {
      function popThreadTaskSet (line 30) | void popThreadTaskSet() {
    function TaskSetBase (line 41) | TaskSetBase* parentTaskSet() {

FILE: dispenso/task_set.h
  function ParentCascadeCancel (line 18) | enum class ParentCascadeCancel { kOff, kOn };

FILE: dispenso/third-party/moodycamel/blockingconcurrentqueue.h
  function namespace (line 19) | namespace moodycamel
  function swap (line 100) | inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
  function enqueue (line 137) | inline bool enqueue(T&& item)
  function enqueue (line 150) | inline bool enqueue(producer_token_t const& token, T const& item)
  function enqueue (line 163) | inline bool enqueue(producer_token_t const& token, T&& item)
  function try_enqueue (line 209) | inline bool try_enqueue(T const& item)
  function try_enqueue (line 223) | inline bool try_enqueue(T&& item)
  function try_enqueue (line 235) | inline bool try_enqueue(producer_token_t const& token, T const& item)
  function try_enqueue (line 247) | inline bool try_enqueue(producer_token_t const& token, T&& item)
  function is_lock_free (line 547) | static constexpr bool is_lock_free()
  function destroy (line 562) | void destroy(U* p)

FILE: dispenso/third-party/moodycamel/concurrentqueue.h
  function namespace (line 87) | namespace moodycamel { namespace details {
  function namespace (line 95) | namespace moodycamel { namespace details {
  function namespace (line 105) | namespace moodycamel { namespace details {
  function namespace (line 113) | namespace moodycamel { namespace details {
  function namespace (line 158) | namespace moodycamel { namespace details {
  function namespace (line 241) | namespace moodycamel { namespace details {
  function namespace (line 280) | namespace moodycamel { namespace details {
  function namespace (line 294) | namespace moodycamel {
  type ConsumerToken (line 720) | struct ConsumerToken
  function swap (line 959) | inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
  function enqueue (line 1010) | inline bool enqueue(T&& item)
  function enqueue (line 1020) | inline bool enqueue(producer_token_t const& token, T const& item)
  function enqueue (line 1029) | inline bool enqueue(producer_token_t const& token, T&& item)
  function try_enqueue (line 1064) | inline bool try_enqueue(T const& item)
  function try_enqueue (line 1075) | inline bool try_enqueue(T&& item)
  function try_enqueue (line 1084) | inline bool try_enqueue(producer_token_t const& token, T const& item)
  function try_enqueue (line 1092) | inline bool try_enqueue(producer_token_t const& token, T&& item)
  function size_approx (line 1329) | size_t size_approx() const
  function update_current_producer_after_rotation (line 1396) | inline bool update_current_producer_after_rotation(consumer_token_t& token)
  function freeListHead (line 1456) | freeListHead(nullptr) { }
  function swap (line 1458) | void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.f...
  function add (line 1463) | inline void add(N* node)
  function N (line 1477) | inline N* try_get()
  function N (line 1517) | N* head_unsafe() const { return freeListHead.load(std::memory_order_rela...
  type InnerQueueContext (line 1561) | enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }
  function else (line 1563) | struct Block
  function T (line 1670) | inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_ca...
  function T (line 1671) | inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { retu...
  type MemStats (line 1694) | struct MemStats
  type ProducerBase (line 1702) | struct ProducerBase
  function virtual (line 1715) | virtual ~ProducerBase() { }
  function ProducerBase (line 1739) | inline ProducerBase* next_prod() const { return static_cast<ProducerBase...
  function size_approx (line 1741) | inline size_t size_approx() const
  function ProducerBase (line 1773) | struct ExplicitProducer : public ProducerBase
  function else (line 1894) | else if (!new_block_index(pr_blockIndexSlotsUsed)) {
  type Guard (line 2028) | struct Guard {
  function else (line 2105) | else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
  function else (line 2185) | else {
  type BlockIndexEntry (line 2345) | struct BlockIndexEntry
  type BlockIndexHeader (line 2351) | struct BlockIndexHeader
  function new_block_index (line 2360) | bool new_block_index(size_t numberOfFilledSlotsToExpose)
  function ProducerBase (line 2425) | struct ImplicitProducer : public ProducerBase
  function else (line 2724) | else {
  type BlockIndexEntry (line 2889) | struct BlockIndexEntry
  type BlockIndexHeader (line 2895) | struct BlockIndexHeader
  function else (line 2925) | else if (!new_block_index()) {
  function rewind_block_index_tail (line 2939) | inline void rewind_block_index_tail()
  function BlockIndexEntry (line 2945) | inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) c...
  function get_block_index_index_for_index (line 2952) | inline size_t get_block_index_index_for_index(index_t index, BlockIndexH...
  function new_block_index (line 2970) | bool new_block_index()
  function populate_initial_block_list (line 3043) | void populate_initial_block_list(size_t blockCount)
  function Block (line 3060) | inline Block* try_get_block_from_initial_pool()
  function add_block_to_free_list (line 3071) | inline void add_block_to_free_list(Block* block)
  function add_blocks_to_free_list (line 3084) | inline void add_blocks_to_free_list(Block* block)
  function Block (line 3093) | inline Block* try_get_block_from_free_list()
  type MemStats (line 3123) | struct MemStats {
  function MemStats (line 3218) | MemStats getMemStats()
  function ProducerBase (line 3231) | ProducerBase* recycle_or_create_producer(bool isExplicit)
  function ProducerBase (line 3250) | ProducerBase* add_producer(ProducerBase* producer)
  function reown_producers (line 3283) | void reown_producers()
  type ImplicitProducerKVP (line 3298) | struct ImplicitProducerKVP
  function swap (line 3317) | inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
  type ImplicitProducerHash (line 3329) | struct ImplicitProducerHash
  function populate_initial_implicit_producer_hash (line 3336) | inline void populate_initial_implicit_producer_hash()
  function swap_implicit_producer_hashes (line 3354) | void swap_implicit_producer_hashes(ConcurrentQueue& other)
  function ImplicitProducer (line 3392) | ImplicitProducer* get_or_add_implicit_producer()
  function swap (line 3727) | inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
  function swap (line 3732) | inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT

FILE: dispenso/third-party/moodycamel/lightweightsemaphore.h
  type _SECURITY_ATTRIBUTES (line 19) | struct _SECURITY_ATTRIBUTES
  function namespace (line 37) | namespace moodycamel
  function tryWait (line 355) | bool tryWait()
  function wait (line 366) | bool wait()
  function wait (line 371) | bool wait(std::int64_t timeout_usecs)
  function tryWaitMany (line 377) | ssize_t tryWaitMany(ssize_t max)
  function waitMany (line 391) | ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs)
  function waitMany (line 400) | ssize_t waitMany(ssize_t max)

FILE: dispenso/thread_id.cpp
  type dispenso (line 10) | namespace dispenso {
    function threadId (line 16) | uint64_t threadId() {

FILE: dispenso/thread_id.h
  function namespace (line 18) | namespace dispenso {

FILE: dispenso/thread_pool.cpp
  type dispenso (line 15) | namespace dispenso {
    function getAdjustedThreadCount (line 18) | size_t getAdjustedThreadCount(size_t requested) {
    function ThreadPool (line 236) | ThreadPool& globalThreadPool() {
    function resizeGlobalThreadPool (line 244) | void resizeGlobalThreadPool(size_t numThreads) {

FILE: dispenso/thread_pool.h
  type ForceQueuingTag (line 65) | struct ForceQueuingTag {}
  function class (line 72) | class alignas(kCacheLineSize) ThreadPool {
  function DISPENSO_DLL_ACCESS (line 118) | DISPENSO_DLL_ACCESS void resize(ssize_t n) {
  function setSignalingWake (line 199) | void setSignalingWake(bool enable, uint32_t sleepDurationUs) {
  function conditionallyWake (line 235) | void conditionallyWake() {
  function delete (line 257) | static void operator delete(void* ptr) {
  function std (line 280) | alignas(kCacheLineSize) std::atomic<ssize_t> numSleeping_{0}
  function std (line 282) | alignas(kCacheLineSize) std::atomic<ssize_t> workRemaining_{0}
  function std (line 289) | alignas(kCacheLineSize) std::atomic<ssize_t> outstandingTaskSets_{0};
  function schedule (line 364) | void ThreadPool::schedule(moodycamel::ProducerToken& token, F&& f, Force...
  function tryExecuteNext (line 379) | inline bool ThreadPool::tryExecuteNext() {
  function tryExecuteNextFromProducerToken (line 391) | inline bool ThreadPool::tryExecuteNextFromProducerToken(moodycamel::Prod...
  function executeNext (line 400) | inline void ThreadPool::executeNext(OnceFunction next) {
  function namespace (line 405) | namespace detail {
  function ProducerToken (line 438) | ProducerToken* token) {

FILE: dispenso/timed_task.cpp
  type dispenso (line 13) | namespace dispenso {
    function TimedTaskScheduler (line 118) | TimedTaskScheduler& globalTimedTaskScheduler() {

FILE: dispenso/timed_task.h
  function namespace (line 25) | namespace dispenso {
  function running_ (line 306) | bool running_{true};

FILE: dispenso/timing.cpp
  type dispenso (line 26) | namespace dispenso {
    function rdtscp (line 31) | inline uint64_t rdtscp() {
    function rdtscp (line 37) | inline uint64_t rdtscp() {
    function rdtscp (line 48) | uint64_t rdtscp(void) {
    function snapFreq (line 60) | static bool snapFreq(double& firstApprox) {
    function fallbackTicksPerSecond (line 91) | static double fallbackTicksPerSecond() {
    function ticksPerSecond (line 134) | static double ticksPerSecond() {
    function ticksPerSecond (line 140) | static double ticksPerSecond() {
    function ticksPerSecond (line 148) | double ticksPerSecond() {
    function getTime (line 153) | double getTime() {
    function getTime (line 161) | double getTime() {

FILE: dispenso/timing.h
  function namespace (line 18) | namespace dispenso {

FILE: dispenso/tsan_annotations.cpp
  type dispenso (line 36) | namespace dispenso {
    type detail (line 37) | namespace detail {
      function annotateIgnoreWritesBegin (line 39) | void annotateIgnoreWritesBegin(const char* f, int l) {
      function annotateIgnoreWritesEnd (line 42) | void annotateIgnoreWritesEnd(const char* f, int l) {
      function annotateIgnoreReadsBegin (line 45) | void annotateIgnoreReadsBegin(const char* f, int l) {
      function annotateIgnoreReadsEnd (line 48) | void annotateIgnoreReadsEnd(const char* f, int l) {
      function annotateNewMemory (line 52) | void annotateNewMemory(const char* f, int l, const volatile void* ad...
      function annotateHappensBefore (line 56) | void annotateHappensBefore(const char* f, int l, const volatile void...
      function annotateHappensAfter (line 60) | void annotateHappensAfter(const char* f, int l, const volatile void*...

FILE: dispenso/tsan_annotations.h
  function namespace (line 39) | namespace dispenso {

FILE: dispenso/util.h
  function namespace (line 24) | namespace dispenso {

FILE: dispenso/utils/graph_dot.h
  function namespace (line 13) | namespace detail {
  function namespace (line 29) | namespace dispenso {

FILE: docs/third-party/doxygen-awesome/doxygen-awesome-darkmode-toggle.js
  class DoxygenAwesomeDarkModeToggle (line 11) | class DoxygenAwesomeDarkModeToggle extends HTMLElement {
    method init (line 39) | static init() {
    method constructor (line 65) | constructor() {
    method systemPreference (line 73) | static get systemPreference() {
    method userPreference (line 80) | static get userPreference() {
    method userPreference (line 85) | static set userPreference(userPreference) {
    method enableDarkMode (line 103) | static enableDarkMode(enable) {
    method onSystemPreferenceChanged (line 115) | static onSystemPreferenceChanged() {
    method onUserPreferenceChanged (line 120) | static onUserPreferenceChanged() {
    method toggleDarkMode (line 124) | toggleDarkMode() {
    method updateIcon (line 129) | updateIcon() {

FILE: examples/concurrent_vector_example.cpp
  function main (line 20) | int main() {

FILE: examples/for_each_example.cpp
  function main (line 20) | int main() {

FILE: examples/future_example.cpp
  function main (line 20) | int main() {

FILE: examples/graph_example.cpp
  function main (line 19) | int main() {

FILE: examples/latch_example.cpp
  function main (line 22) | int main() {

FILE: examples/parallel_for_example.cpp
  function main (line 19) | int main() {

FILE: examples/pipeline_example.cpp
  function main (line 20) | int main() {

FILE: examples/resource_pool_example.cpp
  class Buffer (line 21) | class Buffer {
    method Buffer (line 23) | Buffer() : data_(1024, 0), useCount_(0) {}
    method process (line 25) | void process(int value) {
    method checksum (line 33) | int checksum() const {
    method useCount (line 41) | int useCount() const {
  class DatabaseConnection (line 51) | class DatabaseConnection {
    method DatabaseConnection (line 53) | DatabaseConnection(int id) : connectionId_(id), queryCount_(0) {}
    method executeQuery (line 55) | int executeQuery(int param) {
    method connectionId (line 60) | int connectionId() const {
    method queryCount (line 63) | int queryCount() const {
  function main (line 72) | int main() {

FILE: examples/task_set_example.cpp
  function main (line 20) | int main() {

FILE: scripts/compare_benchmarks.py
  function format_time (line 26) | def format_time(ns: float) -> str:
  function _pct_change (line 38) | def _pct_change(val: float, base_val: float) -> Optional[float]:
  function _normalize_to_ns (line 48) | def _normalize_to_ns(time_val: float, unit: str) -> float:
  function load_benchmarks (line 53) | def load_benchmarks(
  function get_compiler_info (line 98) | def get_compiler_info(path: Path) -> str:
  function _format_comparison_cell (line 107) | def _format_comparison_cell(val: Optional[float], base_val: Optional[flo...
  function _print_summary (line 121) | def _print_summary(
  function compare (line 156) | def compare(
  function main (line 214) | def main():

FILE: scripts/generate_charts.py
  function get_library_color (line 63) | def get_library_color(name: str) -> str:
  function get_time_scale (line 80) | def get_time_scale(values, original_unit: str = "ns"):
  function extract_benchmark_data (line 116) | def extract_benchmark_data(results: List[Dict]) -> "pd.DataFrame":
  function parse_benchmark_name (line 149) | def parse_benchmark_name(name: str) -> Dict[str, Any]:
  function generate_line_chart (line 189) | def generate_line_chart(
  function generate_bar_chart (line 266) | def generate_bar_chart(suite: str, suite_df: "pd.DataFrame", output_dir:...
  function generate_grouped_bar_chart (line 380) | def generate_grouped_bar_chart(
  function generate_horizontal_grouped_bar_chart (line 460) | def generate_horizontal_grouped_bar_chart(
  function generate_concurrent_vector_charts (line 555) | def generate_concurrent_vector_charts(
  function generate_simple_for_charts (line 626) | def generate_simple_for_charts(
  function _parse_future_benchmark (line 780) | def _parse_future_benchmark(name: str):
  function _format_time_value (line 807) | def _format_time_value(val_ns):
  function _add_future_bar_value_labels (line 831) | def _add_future_bar_value_labels(ax, all_bars, all_values, y_max):
  function _render_future_bar_chart (line 860) | def _render_future_bar_chart(
  function generate_future_charts (line 961) | def generate_future_charts(suite_df: "pd.DataFrame", output_dir: Path):
  function generate_graph_charts (line 1032) | def generate_graph_charts(suite_df: "pd.DataFrame", output_dir: Path):
  function _clean_allocator_name (line 1139) | def _clean_allocator_name(name):
  function _generate_small_buffer_charts (line 1153) | def _generate_small_buffer_charts(suite_df: "pd.DataFrame", output_dir: ...
  function _generate_pool_allocator_split_charts (line 1245) | def _generate_pool_allocator_split_charts(
  function generate_allocator_charts (line 1310) | def generate_allocator_charts(suite_df: "pd.DataFrame", output_dir: Path...
  function _parse_rw_lock_name (line 1319) | def _parse_rw_lock_name(name: str):
  function _render_rw_lock_parallel_chart (line 1343) | def _render_rw_lock_parallel_chart(
  function generate_rw_lock_charts (line 1397) | def generate_rw_lock_charts(suite_df: "pd.DataFrame", output_dir: Path):
  function generate_once_function_charts (line 1487) | def generate_once_function_charts(suite_df: "pd.DataFrame", output_dir: ...
  function generate_pool_allocator_charts (line 1604) | def generate_pool_allocator_charts(suite_df: "pd.DataFrame", output_dir:...
  function generate_simple_horizontal_chart (line 1782) | def generate_simple_horizontal_chart(
  function _parse_timed_task_row (line 1827) | def _parse_timed_task_row(row):
  function _build_timed_task_config_order (line 1876) | def _build_timed_task_config_order(timed_df: "pd.DataFrame") -> list:
  function generate_timed_task_charts (line 1895) | def generate_timed_task_charts(suite_df: "pd.DataFrame", output_dir: Path):
  function generate_charts (line 1962) | def generate_charts(df: "pd.DataFrame", output_dir: Path):
  function _format_worksize (line 2049) | def _format_worksize(worksize) -> str:
  function _write_suite_detail_table (line 2116) | def _write_suite_detail_table(suite: str, suite_df: "pd.DataFrame", outp...
  function _write_suite_charts_markdown (line 2130) | def _write_suite_charts_markdown(f, suite: str, suite_parsed: "pd.DataFr...
  function generate_markdown_report (line 2187) | def generate_markdown_report(
  function main (line 2250) | def main():

FILE: scripts/generate_plotly_benchmarks.py
  function get_color (line 90) | def get_color(name):
  function parse_benchmark_name (line 115) | def parse_benchmark_name(name):
  function _format_worksize (line 146) | def _format_worksize(ws):
  function _lib_sort_key (line 158) | def _lib_sort_key(lib):
  function _parse_line_benchmarks (line 174) | def _parse_line_benchmarks(benchmarks):
  function build_line_charts (line 213) | def build_line_charts(benchmarks, suite):
  function _parse_concurrent_vector_benchmarks (line 296) | def _parse_concurrent_vector_benchmarks(benchmarks):
  function _build_cv_group_chart (line 313) | def _build_cv_group_chart(data, suite, group_name, ops_list, suffix, tit...
  function build_concurrent_vector_charts (line 338) | def build_concurrent_vector_charts(benchmarks, suite):
  function _parse_future_benchmarks (line 372) | def _parse_future_benchmarks(benchmarks, impl_map, sizes):
  function _build_error_aware_groups (line 396) | def _build_error_aware_groups(impls, available_sizes, data, error_set):
  function build_future_charts (line 420) | def build_future_charts(benchmarks, suite):
  function build_graph_charts (line 469) | def build_graph_charts(benchmarks, suite):
  function _parse_rw_lock_benchmarks (line 540) | def _parse_rw_lock_benchmarks(benchmarks):
  function _build_grouped_bar_chart (line 565) | def _build_grouped_bar_chart(chart_id, suite, title, categories, mutexes...
  function _build_rw_lock_parallel_charts (line 585) | def _build_rw_lock_parallel_charts(data, suite, mutexes):
  function build_rw_lock_charts (line 607) | def build_rw_lock_charts(benchmarks, suite):
  function _parse_once_function_benchmarks (line 649) | def _parse_once_function_benchmarks(benchmarks):
  function build_once_function_charts (line 673) | def build_once_function_charts(benchmarks, suite):
  function _parse_pool_allocator_benchmarks (line 720) | def _parse_pool_allocator_benchmarks(benchmarks):
  function _build_pool_allocator_chart (line 762) | def _build_pool_allocator_chart(data, suite, tc):
  function build_pool_allocator_charts (line 805) | def build_pool_allocator_charts(benchmarks, suite):
  function build_small_buffer_charts (line 817) | def build_small_buffer_charts(benchmarks, suite):
  function _parse_timed_task_benchmark (line 873) | def _parse_timed_task_benchmark(bm):
  function build_timed_task_charts (line 909) | def build_timed_task_charts(benchmarks, suite):
  function build_pipeline_charts (line 963) | def build_pipeline_charts(benchmarks, suite):
  function _parse_generic_line_benchmarks (line 1006) | def _parse_generic_line_benchmarks(benchmarks):
  function _skipped_libs_list (line 1032) | def _skipped_libs_list(errored_libs, valid_libs):
  function build_generic_line_charts (line 1040) | def build_generic_line_charts(benchmarks, suite):
  function build_generic_bar_charts (line 1106) | def build_generic_bar_charts(benchmarks, suite):
  function build_charts_for_suite (line 1171) | def build_charts_for_suite(benchmarks, suite):
  function generate_html (line 1207) | def generate_html(platform_data, output_path):
  function load_platform (line 1708) | def load_platform(json_path):
  function main (line 1735) | def main():

FILE: scripts/run_benchmarks.py
  function get_machine_info (line 44) | def get_machine_info() -> Dict[str, Any]:
  function generate_platform_id (line 141) | def generate_platform_id(machine_info: Dict[str, Any]) -> str:
  function _parse_cmake_cache (line 187) | def _parse_cmake_cache(cache_path: Path, var_map: list) -> Dict[str, str]:
  function _parse_compiler_cmake (line 211) | def _parse_compiler_cmake(cmake_files_dir: Path, var_map: list) -> Dict[...
  function _build_compiler_summary (line 239) | def _build_compiler_summary(info: Dict[str, str]) -> str:
  function get_compiler_info (line 253) | def get_compiler_info(build_dir: Path) -> Dict[str, str]:
  function _discover_benchmark_targets (line 295) | def _discover_benchmark_targets(build_dir: Path, pattern: str) -> List[s...
  function configure_and_build (line 328) | def configure_and_build(
  function find_benchmarks (line 425) | def find_benchmarks(build_dir: Path, pattern: Optional[str] = None) -> L...
  function run_benchmark (line 466) | def run_benchmark(
  function _build_extra_benchmark_args (line 548) | def _build_extra_benchmark_args(args) -> List[str]:
  function _build_windows_env_override (line 558) | def _build_windows_env_override(
  function main (line 583) | def main():

FILE: scripts/update_benchmarks.py
  function run_command (line 72) | def run_command(cmd: list, description: str, cwd: Path = None) -> bool:
  function configure_build (line 82) | def configure_build(build_dir: Path, source_dir: Path, jobs: int) -> bool:
  function build_benchmarks (line 104) | def build_benchmarks(build_dir: Path, jobs: int) -> bool:
  function copy_all_charts (line 110) | def copy_all_charts(charts_dir: Path, docs_dir: Path) -> int:
  function copy_readme_charts (line 139) | def copy_readme_charts(charts_dir: Path, docs_dir: Path) -> list:
  function generate_landing_page (line 159) | def generate_landing_page(docs_dir: Path, platforms: list) -> Path:
  function compose_platforms (line 196) | def compose_platforms(platform_specs: list, docs_dir: Path) -> bool:
  function _step_build (line 285) | def _step_build(build_dir: Path, source_dir: Path, jobs: int, skip: bool...
  function _step_run (line 309) | def _step_run(
  function _step_generate_charts (line 353) | def _step_generate_charts(
  function main (line 391) | def main():

FILE: scripts/update_package_managers.py
  function parse_args (line 66) | def parse_args():
  function download_and_hash (line 124) | def download_and_hash(version):
  function run (line 179) | def run(cmd, cwd=None, check=True, dry_run=False, capture=False, env=None):
  function ensure_repo (line 202) | def ensure_repo(repos_dir, manager, github_user, dry_run):
  function checkout_branch (line 268) | def checkout_branch(repo_dir, branch, dry_run):
  function commit_and_push (line 344) | def commit_and_push(repo_dir, branch, message, github_user, dry_run, ski...
  function test_conan (line 426) | def test_conan(repo_dir, version):
  function test_vcpkg (line 447) | def test_vcpkg(repo_dir, version):
  function test_homebrew (line 491) | def test_homebrew(repo_dir, version):
  function test_macports (line 561) | def test_macports(repo_dir, version, hashes=None):
  function get_macos_tested_on (line 657) | def get_macos_tested_on():
  function verify_portfile_checksums (line 713) | def verify_portfile_checksums(repo_dir, hashes):
  function verify_formula_checksums (line 737) | def verify_formula_checksums(repo_dir, hashes):
  function ensure_conan_issue (line 764) | def ensure_conan_issue(version, github_user, dry_run):
  function update_conan (line 850) | def update_conan(args, hashes, tarball_path):
  function detect_obsolete_patches (line 949) | def detect_obsolete_patches(tarball_path, port_dir, strip_level=1):
  function remove_obsolete_patches (line 1009) | def remove_obsolete_patches(port_dir, portfile_path, obsolete_patches):
  function _vcpkg_update_port_files (line 1049) | def _vcpkg_update_port_files(repo_dir, version, hashes, dry_run):
  function _vcpkg_cleanup_patches (line 1086) | def _vcpkg_cleanup_patches(repo_dir, tarball_path, dry_run):
  function _macports_cleanup_patches (line 1103) | def _macports_cleanup_patches(repo_dir, tarball_path, dry_run):
  function _vcpkg_run_tooling (line 1157) | def _vcpkg_run_tooling(repo_dir, vcpkg_json_path, dry_run):
  function _vcpkg_verify_port_files (line 1203) | def _vcpkg_verify_port_files(repo_dir, version, hashes):
  function update_vcpkg (line 1245) | def update_vcpkg(args, hashes, tarball_path):
  function update_homebrew (line 1304) | def update_homebrew(args, hashes, tarball_path):
  function update_macports (line 1396) | def update_macports(args, hashes, tarball_path):
  function pr_body_homebrew (line 1518) | def pr_body_homebrew(version, tests_ran):
  function pr_body_conan (line 1552) | def pr_body_conan(version, tests_ran, issue_number=None):
  function pr_body_vcpkg (line 1575) | def pr_body_vcpkg(version):
  function pr_body_macports (line 1588) | def pr_body_macports(version, tests_ran, tested_on=None):
  function pr_title (line 1624) | def pr_title(manager, version):
  function pre_pr_checklist (line 1635) | def pre_pr_checklist(manager, version, tests_ran):
  function get_default_branch (line 1673) | def get_default_branch(upstream_repo):
  function close_superseded_prs (line 1685) | def close_superseded_prs(upstream_repo, version, github_user, dry_run):
  function create_pr (line 1748) | def create_pr(upstream_repo, branch, title, body, github_user, dry_run):
  function create_prs_phase (line 1810) | def create_prs_phase(results, args):
  function post_pr_steps (line 1861) | def post_pr_steps(pr_urls, version):
  function print_summary (line 1880) | def print_summary(results, github_user):
  function prompt_continue (line 1959) | def prompt_continue(message="Continue?"):
  function _detect_default_branch (line 1971) | def _detect_default_branch(repo_dir):
  function _guided_run_updates (line 1992) | def _guided_run_updates(args, managers, hashes, tarball_path):
  function _print_manager_result (line 2038) | def _print_manager_result(mgr, result):
  function _guided_push_branches (line 2053) | def _guided_push_branches(args, pushable):
  function _guided_create_prs (line 2091) | def _guided_create_prs(pushed, args):
  function _guided_review_push_pr (line 2116) | def _guided_review_push_pr(args, results, pushable, tarball_path):
  function _guided_offer_pr_creation (line 2183) | def _guided_offer_pr_creation(args, pushed, results, tarball_path):
  function guided_flow (line 2212) | def guided_flow(args):
  function _guided_cleanup (line 2269) | def _guided_cleanup(tarball_path):
  function main (line 2276) | def main():

FILE: tests/async_request_test.cpp
  function TEST (line 14) | TEST(AsyncRequest, SequentialAsExpected) {
  function TEST (line 34) | TEST(AsyncRequest, AsyncAsExpected) {

FILE: tests/chunked_for_test.cpp
  function TEST (line 15) | TEST(ChunkedFor, SimpleLoop) {
  function TEST (line 38) | TEST(ChunkedFor, ShouldNotInvokeIfEmptyRange) {
  function TEST (line 50) | TEST(ChunkedFor, SimpleLoopStatic) {
  function TEST (line 78) | TEST(ChunkedFor, SimpleLoopAuto) {
  function loopWithStateImpl (line 107) | void loopWithStateImpl() {
  function TEST (line 137) | TEST(ChunkedFor, LoopWithDequeState) {
  function TEST (line 140) | TEST(ChunkedFor, LoopWithVectorState) {
  function TEST (line 143) | TEST(ChunkedFor, LoopWithListState) {
  function TEST (line 147) | TEST(ChunkedFor, SimpleLoopSmallRangeAtLargeValues) {
  function TEST (line 166) | TEST(ChunkedFor, SimpleLoopSmallRange) {
  function TEST (line 185) | TEST(ChunkedFor, LoopSmallRangeWithState) {
  function TEST (line 214) | TEST(ChunkedFor, SimpleLoopSmallRangeExternalWait) {
  function TEST (line 238) | TEST(ChunkedFor, LoopSmallRangeWithStateWithExternalWait) {
  function minChunkSize (line 272) | static void minChunkSize(dispenso::ParForChunking choice, int start, int...
  function TEST (line 296) | TEST(ChunkedFor, MinChunkSizeLoopAuto) {
  function TEST (line 304) | TEST(ChunkedFor, MinChunkSizeLoopStatic) {
  function loopWithStateImplReuseState (line 313) | void loopWithStateImplReuseState() {
  function TEST (line 350) | TEST(ChunkedFor, LoopWithDequeStateReuse) {
  function TEST (line 353) | TEST(ChunkedFor, LoopWithVectorStateReuse) {
  function TEST (line 356) | TEST(ChunkedFor, LoopWithListStateReuse) {

FILE: tests/completion_event_test.cpp
  function TEST (line 15) | TEST(CompletionEvent, NotifyBeforeWait) {
  function TEST (line 23) | TEST(CompletionEvent, NotifyBeforeWaitFor) {
  function TEST (line 38) | TEST(CompletionEvent, WaitForSomeTime) {
  function TEST (line 55) | TEST(CompletionEvent, WaitForSomeTimeWithReset) {
  function TEST (line 89) | TEST(CompletionEvent, EffectiveBarrier) {

FILE: tests/concurrent_object_arena_test.cpp
  function TEST (line 13) | TEST(ConcurrentObjectArena, ParallelGrowBy) {
  function TEST (line 52) | TEST(ConcurrentObjectArena, ObjectsConstuction) {
  function TEST (line 111) | TEST(ConcurrentObjectArena, BufferSizeRounding) {
  function TEST (line 125) | TEST(ConcurrentObjectArena, ExactPowerOfTwoBufferSize) {
  function TEST (line 135) | TEST(ConcurrentObjectArena, MoveAssignment) {
  function TEST (line 167) | TEST(ConcurrentObjectArena, SwapFunction) {
  function TEST (line 197) | TEST(ConcurrentObjectArena, ConstAccess) {
  function TEST (line 224) | TEST(ConcurrentObjectArena, DifferentIndexType) {
  function TEST (line 240) | TEST(ConcurrentObjectArena, CustomAlignment) {
  function TEST (line 252) | TEST(ConcurrentObjectArena, GrowByZero) {
  function TEST (line 260) | TEST(ConcurrentObjectArena, SingleElementGrowth) {

FILE: tests/concurrent_vector_test_common.h
  function string (line 1050) | string printVec(const V& vec) {
  type NonMovable (line 1418) | struct NonMovable {

FILE: tests/fast_math/acos_test.cpp
  function TEST (line 17) | TEST(Acos, OutOfRange) {
  function TEST (line 24) | TEST(Acos, SpecialVals) {
  function TEST (line 41) | TEST(Acos, Range) {

FILE: tests/fast_math/asin_test.cpp
  function TEST (line 16) | TEST(Asin, OutOfRange) {
  function TEST (line 23) | TEST(Asin, SpecialVals) {
  function TEST (line 32) | TEST(Asin, Range) {

FILE: tests/fast_math/atan2_test.cpp
  function TEST (line 19) | TEST(Atan2, SpecialVals) {
  type BoundsTraits (line 68) | struct BoundsTraits {
  function TEST (line 73) | TEST(Atan2WBounds, SpecialVals) {
  function TEST (line 92) | TEST(Atan2, RangeNearZero) {
  function TEST (line 115) | TEST(Atan, RandomSamples) {
  function Flt (line 137) | Flt atan2_max(Flt y, Flt x) {

FILE: tests/fast_math/atan_test.cpp
  function TEST (line 22) | TEST(Atan, SpecialVals) {
  function TEST (line 38) | TEST(Atan, Range) {

FILE: tests/fast_math/avx512_test.cpp
  function lane (line 26) | static float lane(__m512 v, int i) {
  function lane (line 33) | static int32_t lane(__m512i v, int i) {
  function maskBit (line 40) | static bool maskBit(Avx512Mask m, int i) {
  function __m512 (line 45) | static __m512 make16(
  function TEST (line 67) | TEST(Avx512Mask, ConstructFromInt) {
  function TEST (line 74) | TEST(Avx512Mask, LogicalOps) {
  function TEST (line 91) | TEST(Avx512Mask, Equality) {
  function TEST (line 109) | TEST(Avx512Mask, ToInt32Conversion) {
  function TEST (line 120) | TEST(Avx512Float, Broadcast) {
  function TEST (line 127) | TEST(Avx512Float, Arithmetic) {
  function TEST (line 150) | TEST(Avx512Float, Negation) {
  function TEST (line 180) | TEST(Avx512Float, CompoundAssignment) {
  function TEST (line 192) | TEST(Avx512Float, Comparisons) {
  function TEST (line 224) | TEST(Avx512Int32, BasicOps) {
  function TEST (line 234) | TEST(Avx512Int32, Negation) {
  function TEST (line 243) | TEST(Avx512Int32, ShiftOps) {
  function TEST (line 254) | TEST(Avx512Int32, Comparisons) {
  function TEST (line 282) | TEST(Avx512Uint32, LogicalShift) {
  function TEST (line 290) | TEST(Avx512Uint32, UnsignedComparison) {
  function TEST (line 302) | TEST(Avx512BitCast, FloatToInt) {
  function TEST (line 310) | TEST(Avx512BitCast, IntToFloat) {
  function TEST (line 318) | TEST(Avx512BitCast, RoundTrip) {
  function TEST (line 344) | TEST(Avx512FloatTraits, ConditionalWithMask) {
  function TEST (line 356) | TEST(Avx512FloatTraits, ConditionalFromComparison) {
  function TEST (line 369) | TEST(Avx512FloatTraits, ConditionalInt32WithMask) {
  function TEST (line 380) | TEST(Avx512FloatTraits, ConditionalWithLaneWideMask) {
  function TEST (line 394) | TEST(Avx512FloatTraits, Apply) {
  function TEST (line 406) | TEST(Avx512FloatTraits, Fma) {
  function TEST (line 414) | TEST(Avx512FloatTraits, Sqrt) {
  function TEST (line 439) | TEST(Avx512FloatTraits, MinMax) {
  function TEST (line 452) | TEST(Avx512Util, FloorSmall) {
  function TEST (line 476) | TEST(Avx512Util, ConvertToInt) {
  function TEST (line 502) | TEST(Avx512Util, ConvertToInt_NaN) {
  function TEST (line 528) | TEST(Avx512Util, Gather) {
  function TEST (line 555) | TEST(Avx512Util, IntDivBy3) {
  function TEST (line 564) | TEST(Avx512Util, Signof) {
  function TEST (line 589) | TEST(Avx512Util, Signofi) {
  function TEST (line 602) | TEST(Avx512Util, Nonnormal) {
  function TEST (line 634) | TEST(Avx512Util, NonnormalOrZero) {
  function TEST (line 663) | TEST(Avx512Util, BoolAsOne) {
  function TEST (line 675) | TEST(Avx512Util, NboolAsOne) {
  function TEST (line 687) | TEST(Avx512Util, BoolAsMask) {
  function TEST (line 696) | TEST(Avx512Util, BoolApplyOrZero) {
  function TEST (line 720) | TEST(Avx512, Unavailable) {

FILE: tests/fast_math/avx_test.cpp
  function lane (line 25) | static float lane(__m256 v, int i) {
  function lane (line 32) | static int32_t lane(__m256i v, int i) {
  function __m256 (line 39) | static __m256 make8(float a, float b, float c, float d, float e, float f...
  function TEST (line 45) | TEST(AvxFloat, Broadcast) {
  function TEST (line 52) | TEST(AvxFloat, Arithmetic) {
  function TEST (line 75) | TEST(AvxFloat, Negation) {
  function TEST (line 89) | TEST(AvxFloat, CompoundAssignment) {
  function TEST (line 101) | TEST(AvxFloat, Comparisons) {
  function TEST (line 127) | TEST(AvxFloat, LogicalNot) {
  function TEST (line 136) | TEST(AvxInt32, BasicOps) {
  function TEST (line 146) | TEST(AvxInt32, Negation) {
  function TEST (line 154) | TEST(AvxInt32, ShiftOps) {
  function TEST (line 165) | TEST(AvxInt32, Comparisons) {
  function TEST (line 182) | TEST(AvxUint32, LogicalShift) {
  function TEST (line 190) | TEST(AvxUint32, UnsignedComparison) {
  function TEST (line 202) | TEST(AvxBitCast, FloatToInt) {
  function TEST (line 210) | TEST(AvxBitCast, IntToFloat) {
  function TEST (line 218) | TEST(AvxBitCast, RoundTrip) {
  function TEST (line 228) | TEST(AvxFloatTraits, Conditional) {
  function TEST (line 239) | TEST(AvxFloatTraits, ConditionalInt32) {
  function TEST (line 250) | TEST(AvxFloatTraits, Fma) {
  function TEST (line 258) | TEST(AvxFloatTraits, Sqrt) {
  function TEST (line 267) | TEST(AvxFloatTraits, MinMax) {
  function TEST (line 282) | TEST(AvxUtil, FloorSmall) {
  function TEST (line 291) | TEST(AvxUtil, ConvertToInt) {
  function TEST (line 301) | TEST(AvxUtil, ConvertToInt_NaN) {
  function TEST (line 319) | TEST(AvxUtil, Gather) {
  function TEST (line 333) | TEST(AvxUtil, IntDivBy3) {
  function TEST (line 342) | TEST(AvxUtil, Signof) {
  function TEST (line 351) | TEST(AvxUtil, Signofi) {
  function TEST (line 360) | TEST(AvxUtil, Nonnormal) {
  function TEST (line 384) | TEST(AvxFloat, NotAvailable) {

FILE: tests/fast_math/bivariate_ulp_eval.h
  function namespace (line 36) | namespace dispenso {

FILE: tests/fast_math/cbrt_test.cpp
  function TEST (line 22) | TEST(Cbrt, SpecialValues) {
  function TEST (line 34) | TEST(Cbrt, RangeNeg) {
  function TEST (line 43) | TEST(Cbrt, RangePos) {
  function TEST (line 52) | TEST(Cbrt, RangeSmall) {
  function TEST (line 65) | TEST(CbrtAccurate, RangeNeg) {
  function TEST (line 71) | TEST(CbrtAccurate, RangePos) {
  function TEST (line 77) | TEST(CbrtAccurate, RangeSmall) {
  function Flt (line 89) | Flt cbrt_max(Flt x) {

FILE: tests/fast_math/cos_test.cpp
  function gt_cos (line 21) | float gt_cos(float x) {
  function TEST (line 28) | TEST(Cos, SpecialValues) {
  function TEST (line 47) | TEST(Cos, RangePi) {
  function TEST (line 53) | TEST(Cos, Range128Pi) {
  function TEST (line 59) | TEST(Cos, Range1MPi) {
  function TEST (line 68) | TEST(CosLessAccurate, RangePi) {
  function TEST (line 75) | TEST(CosLessAccurate, Range128Pi) {
  function TEST (line 87) | TEST(CosLessAccurate, Range32768Pi) {
  function Flt (line 101) | Flt cos_max(Flt x) {

FILE: tests/fast_math/erf_test.cpp
  function gt_erf (line 17) | static float gt_erf(float x) {
  function TEST (line 23) | TEST(Erf, SpecialValues) {
  function TEST (line 33) | TEST(Erf, NaN) {
  function TEST (line 38) | TEST(Erf, Saturation) {
  function TEST (line 48) | TEST(Erf, NearZero) {

FILE: tests/fast_math/eval.cpp
  type dispenso (line 8) | namespace dispenso {
    type fast_math (line 9) | namespace fast_math {
      type detail (line 10) | namespace detail {

FILE: tests/fast_math/eval.h
  function namespace (line 33) | namespace dispenso {

FILE: tests/fast_math/exp10_test.cpp
  function groundTruth (line 17) | float groundTruth(float input) {
  function TEST (line 23) | TEST(Exp10, SpecialValues) {
  function TEST (line 33) | TEST(Exp10, Range) {
  function TEST (line 38) | TEST(Exp10LessAccurate, RangeMedium) {
  function Flt (line 46) | Flt exp10_max(Flt x) {

FILE: tests/fast_math/exp2_test.cpp
  function TEST (line 17) | TEST(Exp2, SpecialValues) {
  function TEST (line 35) | TEST(Exp2, RangeSmall) {
  function TEST (line 41) | TEST(Exp2, RangeMedium) {
  function TEST (line 47) | TEST(Exp2, RangeLarge) {

FILE: tests/fast_math/exp_test.cpp
  type BoundsOnlyTraits (line 17) | struct BoundsOnlyTraits {
  function TEST (line 24) | TEST(Exp, SpecialValues) {
  function TEST (line 34) | TEST(Exp, RangeSmall) {
  function TEST (line 39) | TEST(Exp, RangeSmallish) {
  function TEST (line 43) | TEST(Exp, RangeMedium) {
  function TEST (line 48) | TEST(Exp, RangeLarge) {
  function TEST (line 55) | TEST(ExpLessAccurateWBounds, SpecialValues) {
  function TEST (line 66) | TEST(ExpLessAccurateWBounds, Range_m100_100) {
  function TEST (line 71) | TEST(ExpLessAccurate, Range_m88_88) {
  function Flt (line 79) | Flt exp_max_fn(Flt x) {
  function Flt (line 84) | Flt exp_bounds_fn(Flt x) {

FILE: tests/fast_math/expm1_test.cpp
  function gt_expm1 (line 17) | static float gt_expm1(float x) {
  function TEST (line 21) | TEST(Expm1, SpecialValues) {
  function TEST (line 27) | TEST(Expm1, NearZero) {
  function TEST (line 38) | TEST(Expm1, RangeSmall) {
  function TEST (line 43) | TEST(Expm1, RangeMedium) {
  function TEST (line 48) | TEST(Expm1, RangeLarge) {

FILE: tests/fast_math/frexp_test.cpp
  function TEST (line 17) | TEST(Frexp, SpecialVals) {
  function TEST (line 30) | TEST(Frexp, RangeNeg) {
  function TEST (line 41) | TEST(Frexp, RangePos) {
  function checkFrexpSimd (line 80) | void checkFrexpSimd() {
  function TEST (line 126) | TEST(FrexpSse, SpecialVals) {
  function TEST (line 131) | TEST(FrexpAvx, SpecialVals) {
  function TEST (line 136) | TEST(FrexpAvx512, SpecialVals) {
  function TEST (line 141) | TEST(FrexpNeon, SpecialVals) {
  function TEST (line 146) | TEST(FrexpHwy, SpecialVals) {

FILE: tests/fast_math/hwy_test.cpp
  function N (line 33) | static size_t N() {
  function lane (line 40) | static float lane(HwyVecF v, size_t i) {
  function lane (line 45) | static float lane(HwyFloat v, size_t i) {
  function lane (line 49) | static int32_t lane(HwyVecI v, size_t i) {
  function lane (line 54) | static int32_t lane(HwyInt32 v, size_t i) {
  function lane (line 58) | static uint32_t lane(HwyVecU v, size_t i) {
  function lane (line 63) | static uint32_t lane(HwyUint32 v, size_t i) {
  function HwyVecF (line 68) | static HwyVecF loadF(const float* vals) {
  function HwyVecI (line 73) | static HwyVecI loadI(const int32_t* vals) {
  function TEST (line 79) | TEST(HwyFloat, Broadcast) {
  function TEST (line 86) | TEST(HwyFloat, Arithmetic) {
  function TEST (line 110) | TEST(HwyFloat, Negation) {
  function TEST (line 130) | TEST(HwyFloat, CompoundAssignment) {
  function TEST (line 155) | TEST(HwyFloat, Comparisons) {
  function TEST (line 184) | TEST(HwyFloat, LogicalNot) {
  function TEST (line 197) | TEST(HwyInt32, ArithmeticAndShifts) {
  function TEST (line 226) | TEST(HwyInt32, Negation) {
  function TEST (line 234) | TEST(HwyInt32, Bitwise) {
  function TEST (line 251) | TEST(HwyInt32, Comparisons) {
  function TEST (line 270) | TEST(HwyUint32, ArithmeticAndShifts) {
  function TEST (line 291) | TEST(HwyUint32, UnsignedComparisons) {
  function TEST (line 306) | TEST(HwyBitCast, FloatIntRoundTrip) {
  function TEST (line 317) | TEST(HwyBitCast, FloatUintRoundTrip) {
  function TEST (line 330) | TEST(HwyFloatTraits, Conditional) {
  function TEST (line 351) | TEST(HwyFloatTraits, ConditionalInt32) {
  function TEST (line 370) | TEST(HwyFloatTraits, ConditionalWithLaneWideMask) {
  function TEST (line 390) | TEST(HwyFloatTraits, Apply) {
  function TEST (line 409) | TEST(HwyFloatTraits, FmaAndSqrt) {
  function TEST (line 429) | TEST(HwyFloatTraits, MinMax) {
  function TEST (line 452) | TEST(HwyUtil, FloorSmall) {
  function TEST (line 466) | TEST(HwyUtil, ConvertToInt) {
  function TEST (line 481) | TEST(HwyUtil, ConvertToIntNaN) {
  function TEST (line 489) | TEST(HwyUtil, Gather) {
  function TEST (line 507) | TEST(HwyUtil, IntDivBy3) {
  function TEST (line 522) | TEST(HwyUtil, Signof) {
  function TEST (line 537) | TEST(HwyUtil, Signofi) {
  function TEST (line 552) | TEST(HwyUtil, Nonnormal) {
  function TEST (line 568) | TEST(HwyUtil, BoolAsOne) {
  function TEST (line 581) | TEST(HwyUtil, NBoolAsOne) {
  function TEST (line 594) | TEST(HwyUtil, BoolAsMask) {
  function TEST (line 603) | TEST(HwyUtil, BoolApplyOrZero) {
  function TEST (line 620) | TEST(HwyUtil, ClampAllowNan) {
  function TEST (line 638) | TEST(HwyUtil, ClampNoNan) {
  function TEST (line 661) | TEST(HwyFloat, Unavailable) {

FILE: tests/fast_math/hypot_test.cpp
  function hypotRef (line 22) | static float hypotRef(float x, float y) {
  function TEST (line 28) | TEST(Hypot, SpecialValues) {
  function TEST (line 38) | TEST(Hypot, GridNearZero) {
  function TEST (line 58) | TEST(Hypot, RandomNormal) {
  function TEST (line 72) | TEST(Hypot, RandomWide) {
  function TEST (line 86) | TEST(Hypot, RandomTiny) {
  function TEST (line 100) | TEST(Hypot, Symmetry) {
  function TEST (line 114) | TEST(Hypot, DiagonalSweep) {
  function hypotBounds (line 125) | static float hypotBounds(Args... args) {
  function TEST (line 129) | TEST(HypotBounds, InfFinite) {
  function TEST (line 140) | TEST(HypotBounds, InfNaN) {
  function TEST (line 154) | TEST(HypotBounds, NaNFinite) {
  function Flt (line 164) | Flt hypot_max(Flt x, Flt y) {

FILE: tests/fast_math/ldexp_test.cpp
  function TEST (line 17) | TEST(Ldexp, SpecialVals) {
  function TEST (line 34) | TEST(Ldexp, Range) {
  function checkLdexpSimd (line 66) | void checkLdexpSimd() {
  function TEST (line 105) | TEST(LdexpSse, RoundTrip) {
  function TEST (line 110) | TEST(LdexpAvx, RoundTrip) {
  function TEST (line 115) | TEST(LdexpAvx512, RoundTrip) {
  function TEST (line 120) | TEST(LdexpNeon, RoundTrip) {
  function TEST (line 125) | TEST(LdexpHwy, RoundTrip) {

FILE: tests/fast_math/log10_test.cpp
  function TEST (line 19) | TEST(Log10, SpecialValues) {
  function TEST (line 29) | TEST(Log10Accurate, RangeNeg) {
  function TEST (line 34) | TEST(Log10Accurate, RangePos) {
  function TEST (line 39) | TEST(Log10, RangeNeg) {
  function TEST (line 45) | TEST(Log10, RangePos) {
  function Flt (line 53) | Flt log10_max(Flt x) {

FILE: tests/fast_math/log1p_test.cpp
  function gt_log1p (line 17) | static float gt_log1p(float x) {
  function TEST (line 21) | TEST(Log1p, SpecialValues) {
  function TEST (line 27) | TEST(Log1p, NearZero) {
  function TEST (line 38) | TEST(Log1p, RangeSmall) {
  function TEST (line 43) | TEST(Log1p, RangeMedium) {
  function TEST (line 48) | TEST(Log1p, RangeLarge) {

FILE: tests/fast_math/log2_test.cpp
  function groundTruth (line 20) | float groundTruth(float x) {
  function TEST (line 27) | TEST(Log2, SpecialValues) {
  function TEST (line 42) | TEST(Log2WBounds, RangeNeg) {
  function TEST (line 47) | TEST(Log2WBounds, RangePos) {
  function TEST (line 52) | TEST(Log2, RangeNeg) {
  function TEST (line 58) | TEST(Log2, RangePos) {
  function Flt (line 66) | Flt log2_max(Flt x) {

FILE: tests/fast_math/log_test.cpp
  function TEST (line 19) | TEST(Log, SpecialValues) {
  function TEST (line 29) | TEST(LogAccurate, RangeNeg) {
  function TEST (line 34) | TEST(LogAccurate, RangePos) {
  function TEST (line 39) | TEST(Log, RangeNeg) {
  function TEST (line 45) | TEST(Log, RangePos) {
  function Flt (line 52) | Flt log_max(Flt x) {

FILE: tests/fast_math/neon_test.cpp
  function lane (line 27) | static float lane(float32x4_t v, int i) {
  function lane (line 32) | static float lane(NeonFloat v, int i) {
  function lane (line 36) | static int32_t lane(int32x4_t v, int i) {
  function lane (line 41) | static int32_t lane(NeonInt32 v, int i) {
  function lane (line 45) | static uint32_t lane(uint32x4_t v, int i) {
  function lane (line 50) | static uint32_t lane(NeonUint32 v, int i) {
  function float32x4_t (line 56) | static float32x4_t make4(float a, float b, float c, float d) {
  function int32x4_t (line 61) | static int32x4_t makeInt4(int32_t a, int32_t b, int32_t c, int32_t d) {
  function TEST (line 68) | TEST(NeonFloat, Arithmetic) {
  function TEST (line 78) | TEST(NeonFloat, Negation) {
  function TEST (line 87) | TEST(NeonFloat, CompoundAssignment) {
  function TEST (line 100) | TEST(NeonInt32, Arithmetic) {
  function TEST (line 109) | TEST(NeonInt32, Negation) {
  function TEST (line 118) | TEST(NeonInt32, Shifts) {
  function TEST (line 131) | TEST(NeonUint32, Shifts) {
  function TEST (line 146) | TEST(NeonFloat, Comparisons) {
  function TEST (line 157) | TEST(NeonInt32, Comparisons) {
  function TEST (line 174) | TEST(NeonUint32, UnsignedComparisons) {
  function TEST (line 186) | TEST(NeonBitCast, FloatIntRoundTrip) {
  function TEST (line 195) | TEST(NeonBitCast, FloatUintRoundTrip) {
  function TEST (line 204) | TEST(NeonBitCast, IntUintRoundTrip) {
  function TEST (line 215) | TEST(NeonFloatTraits, ConditionalWithMask) {
  function TEST (line 228) | TEST(NeonFloatTraits, ConditionalFromComparison) {
  function TEST (line 239) | TEST(NeonFloatTraits, ConditionalInt32WithMask) {
  function TEST (line 251) | TEST(NeonFloatTraits, ConditionalWithLaneWideMask) {
  function TEST (line 263) | TEST(NeonFloatTraits, Apply) {
  function TEST (line 274) | TEST(NeonFloatTraits, Fma) {
  function TEST (line 282) | TEST(NeonFloatTraits, Sqrt) {
  function TEST (line 291) | TEST(NeonFloatTraits, MinMax) {
  function TEST (line 308) | TEST(NeonUtil, FloorSmall) {
  function TEST (line 317) | TEST(NeonUtil, ConvertToInt) {
  function TEST (line 327) | TEST(NeonUtil, ConvertToIntNaN) {
  function TEST (line 338) | TEST(NeonUtil, Gather) {
  function TEST (line 348) | TEST(NeonUtil, IntDivBy3) {
  function TEST (line 357) | TEST(NeonUtil, Signof) {
  function TEST (line 366) | TEST(NeonUtil, Signofi) {
  function TEST (line 375) | TEST(NeonUtil, Nonnormal) {
  function TEST (line 386) | TEST(NeonUtil, NonnormalOrZero) {
  function TEST (line 398) | TEST(NeonUtil, BoolAsOne) {
  function TEST (line 412) | TEST(NeonUtil, NboolAsOne) {
  function TEST (line 426) | TEST(NeonUtil, BoolAsMask) {
  function TEST (line 434) | TEST(NeonUtil, BoolApplyOrZero) {
  function TEST (line 444) | TEST(NeonUtil, ClampAllowNan) {
  function TEST (line 462) | TEST(NeonUtil, ClampNoNan) {

FILE: tests/fast_math/pow_test.cpp
  function gt_pow (line 21) | static float gt_pow(float x, float y) {
  function TEST (line 27) | TEST(Pow, SpecialValues) {
  function TEST (line 36) | TEST(Pow, ExactIntegerPowers) {
  function TEST (line 54) | TEST(Pow, RandomModerate) {
  function TEST (line 76) | TEST(Pow, RandomWide) {
  function TEST (line 97) | TEST(Pow, RandomSmallExp) {
  function TEST (line 120) | TEST(Pow, NegativeBaseIntegerExp) {
  function TEST (line 147) | TEST(Pow, NegativeBaseNonIntegerExp) {
  function TEST (line 161) | TEST(Pow, NegativeBaseRandomInteger) {
  function TEST (line 186) | TEST(PowBounds, YZero) {
  function TEST (line 198) | TEST(PowBounds, XOne) {
  function TEST (line 210) | TEST(PowBounds, NegOneInf) {
  function TEST (line 218) | TEST(PowBounds, ZeroPosExp) {
  function TEST (line 235) | TEST(PowBounds, ZeroNegExp) {
  function TEST (line 246) | TEST(PowBounds, InfPosExp) {
  function TEST (line 257) | TEST(PowBounds, InfNegExp) {
  function TEST (line 273) | TEST(PowBounds, AbsXInf) {
  function TEST (line 283) | TEST(PowBounds, NaNPropagation) {
  function TEST (line 293) | TEST(PowBounds, Subnormal) {
  function TEST (line 321) | TEST(PowAccurate, RandomModerate) {
  function TEST (line 342) | TEST(PowDouble, RandomModerate) {
  function TEST (line 363) | TEST(PowDouble, RandomWide) {
  function Flt (line 387) | Flt pow_default(Flt x, Flt y) {
  function Flt (line 392) | Flt pow_max(Flt x, Flt y) {

FILE: tests/fast_math/pow_ulp_eval.cpp
  function gt_pow (line 23) | static float gt_pow(float x, float y) {
  function pow_scalar (line 41) | static void pow_scalar(const float* xs, const float* ys, float* out, int...
  function pow_scalar_accurate (line 46) | static void pow_scalar_accurate(const float* xs, const float* ys, float*...
  function pow_sse (line 52) | static void pow_sse(const float* xs, const float* ys, float* out, int32_...
  function pow_sse_accurate (line 61) | static void pow_sse_accurate(const float* xs, const float* ys, float* ou...
  function pow_avx (line 72) | static void pow_avx(const float* xs, const float* ys, float* out, int32_...
  function pow_avx_accurate (line 81) | static void pow_avx_accurate(const float* xs, const float* ys, float* ou...
  function pow_avx512 (line 92) | static void pow_avx512(const float* xs, const float* ys, float* out, int...
  function pow_avx512_accurate (line 101) | static void pow_avx512_accurate(const float* xs, const float* ys, float*...
  function main (line 113) | int main(int argc, char** argv) {

FILE: tests/fast_math/simd_test_utils.h
  function namespace (line 40) | namespace dispenso {

FILE: tests/fast_math/sin_test.cpp
  function gt_sin (line 20) | float gt_sin(float x) {
  function TEST (line 27) | TEST(Sin, SpecialValues) {
  function TEST (line 44) | TEST(Sin, RangePi) {
  function TEST (line 50) | TEST(Sin, Range128Pi) {
  function TEST (line 56) | TEST(Sin, Range1MPi) {
  function TEST (line 65) | TEST(SinLessAccurate, RangePi) {
  function TEST (line 72) | TEST(SinLessAccurate, Range128Pi) {
  function TEST (line 84) | TEST(SinLessAccurate, Range32768Pi) {
  function Flt (line 98) | Flt sin_max(Flt x) {

FILE: tests/fast_math/sincos_test.cpp
  function TEST (line 16) | TEST(SinCos, BasicValues) {
  function TEST (line 35) | TEST(SinCos, MatchesSeparateCalls) {
  function TEST (line 51) | TEST(SinCos, AccuracyVsLibc) {
  function TEST (line 72) | TEST(SinCos, MaxAccuracy) {

FILE: tests/fast_math/sinpi_test.cpp
  function gt_sinpi (line 18) | static float gt_sinpi(float x) {
  function gt_cospi (line 48) | static float gt_cospi(float x) {
  function TEST (line 79) | TEST(SinPi, ExactValues) {
  function TEST (line 94) | TEST(SinPi, AccuracyVsLibc) {
  function TEST (line 107) | TEST(CosPi, ExactValues) {
  function TEST (line 120) | TEST(CosPi, AccuracyVsLibc) {
  function TEST (line 133) | TEST(SinCosPi, MatchesSeparateCalls) {
  function TEST (line 144) | TEST(SinCosPi, AccuracyVsLibc) {
  function TEST (line 157) | TEST(SinPi, LargeArguments) {
  function TEST (line 164) | TEST(CosPi, LargeArguments) {

FILE: tests/fast_math/sse_test.cpp
  function lane (line 20) | static float lane(__m128 v, int i) {
  function lane (line 27) | static int32_t lane(__m128i v, int i) {
  function __m128 (line 34) | static __m128 make4(float a, float b, float c, float d) {
  function TEST (line 40) | TEST(SseFloat, Broadcast) {
  function TEST (line 47) | TEST(SseFloat, Arithmetic) {
  function TEST (line 68) | TEST(SseFloat, Negation) {
  function TEST (line 79) | TEST(SseFloat, Comparisons) {
  function TEST (line 92) | TEST(SseInt32, BasicOps) {
  function TEST (line 100) | TEST(SseInt32, ShiftOps) {
  function TEST (line 111) | TEST(SseUint32, LogicalShift) {
  function TEST (line 119) | TEST(SseBitCast, FloatToInt) {
  function TEST (line 125) | TEST(SseBitCast, IntToFloat) {
  function TEST (line 131) | TEST(SseBitCast, RoundTrip) {
  function TEST (line 141) | TEST(SseFloatTraits, Conditional) {
  function TEST (line 152) | TEST(SseFloatTraits, Fma) {
  function TEST (line 158) | TEST(SseFloatTraits, MinMax) {
  function TEST (line 171) | TEST(SseUtil, FloorSmall) {
  function TEST (line 180) | TEST(SseUtil, ConvertToInt) {
  function TEST (line 189) | TEST(SseUtil, Gather) {
  function TEST (line 199) | TEST(SseUtil, IntDivBy3) {
  function TEST (line 208) | TEST(SseUtil, Signof) {
  function TEST (line 217) | TEST(SseUtil, Signofi) {
  function TEST (line 229) | TEST(SseFloat, NotAvailable) {

FILE: tests/fast_math/tan_test.cpp
  function TEST (line 17) | TEST(Tan, SpecialValues) {
  function TEST (line 30) | TEST(TanLessAccurate, Range8Pi) {
  function TEST (line 37) | TEST(TanLessAccurate, Range1KPi) {
  function TEST (line 47) | TEST(TanLessAccurate, Range32KPi) {
  function TEST (line 59) | TEST(TanAccurate, Range2MPi) {
  function Flt (line 71) | Flt tan_max(Flt x) {

FILE: tests/fast_math/tanh_test.cpp
  function gt_tanh (line 17) | static float gt_tanh(float x) {
  function TEST (line 21) | TEST(Tanh, SpecialValues) {
  function TEST (line 27) | TEST(Tanh, Saturation) {
  function TEST (line 35) | TEST(Tanh, NearZero) {
  function TEST (line 46) | TEST(Tanh, RangeSmall) {
  function TEST (line 51) | TEST(Tanh, RangeFull) {

FILE: tests/fast_math/test_main.cpp
  function isRosetta (line 20) | static bool isRosetta() {
  class RosettaGuard (line 26) | class RosettaGuard : public testing::Environment {
    method SetUp (line 28) | void SetUp() override {

FILE: tests/fast_math/ulp_eval.cpp
  type Band (line 27) | struct Band {
  function evalBands (line 83) | static void evalBands(const char* name, FnType gt, FnType fn, Band* band...
  function eval (line 135) | static void eval(const char* name, FnType gt, FnType fn, Band (&bands)[N...
  function gt_sin (line 140) | static float gt_sin(float x) {
  function gt_cos (line 147) | static float gt_cos(float x) {
  function gt_tan (line 154) | static float gt_tan(float x) {
  function gt_asin (line 157) | static float gt_asin(float x) {
  function gt_acos (line 160) | static float gt_acos(float x) {
  function gt_atan (line 163) | static float gt_atan(float x) {
  function gt_exp (line 166) | static float gt_exp(float x) {
  function gt_exp2 (line 169) | static float gt_exp2(float x) {
  function gt_log (line 172) | static float gt_log(float x) {
  function gt_log2 (line 175) | static float gt_log2(float x) {
  function gt_log10 (line 178) | static float gt_log10(float x) {
  function gt_cbrt (line 181) | static float gt_cbrt(float x) {
  function gt_expm1 (line 184) | static float gt_expm1(float x) {
  function gt_log1p (line 187) | static float gt_log1p(float x) {
  function gt_tanh (line 190) | static float gt_tanh(float x) {
  function gt_erf (line 193) | static float gt_erf(float x) {
  function main (line 247) | int main() {

FILE: tests/fast_math/util_test.cpp
  function TEST (line 19) | TEST(BitCast, FloatToUint) {
  function TEST (line 26) | TEST(BitCast, UintToFloat) {
  function TEST (line 31) | TEST(BitCast, RoundTrip) {
  function TEST (line 41) | TEST(FloatDistance, IdenticalValues) {
  function TEST (line 47) | TEST(FloatDistance, OneUlp) {
  function TEST (line 54) | TEST(FloatDistance, SymmetricForPositive) {
  function TEST (line 60) | TEST(FloatDistance, DenormalsAreZero) {
  function TEST (line 68) | TEST(Signof, PositiveValues) {
  function TEST (line 74) | TEST(Signof, NegativeValues) {
  function TEST (line 80) | TEST(Signof, Zero) {
  function TEST (line 88) | TEST(Signofi, PositiveValues) {
  function TEST (line 93) | TEST(Signofi, NegativeValues) {
  function TEST (line 98) | TEST(Signofi, Zero) {
  function TEST (line 104) | TEST(Nonnormal, IntVersion) {
  function TEST (line 121) | TEST(Nonnormal, FloatVersion) {
  function TEST (line 131) | TEST(NonnormalOrZero, Normal) {
  function TEST (line 138) | TEST(NonnormalOrZero, Zero) {
  function TEST (line 143) | TEST(NonnormalOrZero, Inf) {
  function TEST (line 148) | TEST(NonnormalOrZero, NaN) {
  function TEST (line 155) | TEST(ConvertToInt, BasicValues) {
  function TEST (line 163) | TEST(ConvertToInt, Rounding) {
  function TEST (line 178) | TEST(ConvertToInt, LargeValues) {
  function TEST (line 186) | TEST(ConvertToIntClamped, WithinRange) {
  function TEST (line 192) | TEST(ConvertToIntClamped, AtBounds) {
  function TEST (line 197) | TEST(ConvertToIntClamped, ClampedAbove) {
  function TEST (line 201) | TEST(ConvertToIntClamped, ClampedBelow) {
  function TEST (line 207) | TEST(FloorSmall, Integers) {
  function TEST (line 214) | TEST(FloorSmall, PositiveFractions) {
  function TEST (line 221) | TEST(FloorSmall, NegativeFractions) {
  function TEST (line 228) | TEST(FloorSmall, MatchesStdFloor) {
  function TEST (line 252) | TEST(Min, BasicValues) {
  function TEST (line 259) | TEST(Min, NanBehavior) {
  function TEST (line 267) | TEST(ClampAllowNan, WithinRange) {
  function TEST (line 271) | TEST(ClampAllowNan, BelowMin) {
  function TEST (line 275) | TEST(ClampAllowNan, AboveMax) {
  function TEST (line 279) | TEST(ClampAllowNan, NanInput) {
  function TEST (line 289) | TEST(ClampNoNan, WithinRange) {
  function TEST (line 293) | TEST(ClampNoNan, BelowMin) {
  function TEST (line 297) | TEST(ClampNoNan, AboveMax) {
  function TEST (line 301) | TEST(ClampNoNan, AtBounds) {
  function TEST (line 308) | TEST(Gather, BasicLookup) {
  function TEST (line 317) | TEST(NboolAsOne, Float) {
  function TEST (line 322) | TEST(NboolAsOne, Int) {
  function TEST (line 327) | TEST(BoolAsOne, Float) {
  function TEST (line 332) | TEST(BoolAsOne, Int) {
  function TEST (line 339) | TEST(BoolAsMask, TrueGivesAllOnes) {
  function TEST (line 344) | TEST(BoolAsMask, FalseGivesZero) {
  function TEST (line 351) | TEST(BoolApplyOrZero, TrueReturnsValue) {
  function TEST (line 356) | TEST(BoolApplyOrZero, FalseReturnsZero) {
  function TEST (line 361) | TEST(BoolApplyOrZero, FloatTrue) {
  function TEST (line 366) | TEST(BoolApplyOrZero, FloatFalse) {
  function TEST (line 373) | TEST(IntDivBy3, ExactMultiples) {
  function TEST (line 382) | TEST(IntDivBy3, WithRemainder) {
  function TEST (line 393) | TEST(IntDivBy3, LargeValues) {

FILE: tests/for_each_test.cpp
  function forEachTestHelper (line 22) | void forEachTestHelper() {
  function TEST (line 40) | TEST(ForEach, Vector) {
  function TEST (line 44) | TEST(ForEach, List) {
  function TEST (line 48) | TEST(ForEach, Deque) {
  function TEST (line 52) | TEST(ForEach, Set) {
  function TEST (line 70) | TEST(ForEach, UnorderedMap) {
  function forEachNTestHelper (line 89) | void forEachNTestHelper() {
  function TEST (line 107) | TEST(ForEachN, Vector) {
  function TEST (line 111) | TEST(ForEachN, List) {
  function TEST (line 115) | TEST(ForEachN, Deque) {
  function TEST (line 119) | TEST(ForEachN, Set) {
  function TEST (line 137) | TEST(ForEachN, UnorderedMap) {
  function TEST (line 155) | TEST(ForEach, NoWaitFewerThreads) {
  function TEST (line 190) | TEST(ForEachN, NoWaitFewerThreads) {
  function TEST (line 225) | TEST(ForEach, SmallSet) {
  function TEST (line 243) | TEST(ForEach, EmptySet) {
  function TEST (line 250) | TEST(ForEach, References) {
  function TEST (line 263) | TEST(ForEach, Cascade) {
  function testMaxThreads (line 297) | static void testMaxThreads(size_t poolSize, uint32_t maxThreads, bool te...
  function TEST (line 341) | TEST(ForEach, OptionsMaxThreadsBigPoolBlocking) {
  function TEST (line 346) | TEST(ForEach, OptionsMaxThreadsBigPoolNonBlocking) {
  function TEST (line 351) | TEST(ForEach, OptionsMaxThreadsSmallPoolBlocking) {
  function TEST (line 356) | TEST(ForEach, OptionsMaxThreadsSmallPoolNonBlocking) {
  function TEST (line 361) | TEST(ForEach, OptionsMaxThreadsSerialBlocking) {
  function TEST (line 366) | TEST(ForEach, OptionsMaxThreadsSerialNonBlocking) {
  type NoDefaultCtorIterator (line 373) | struct NoDefaultCtorIterator {
    method NoDefaultCtorIterator (line 380) | explicit NoDefaultCtorIterator(const std::vector<size_t>& v, size_t id...
    method reference (line 382) | reference operator*() const {
    method NoDefaultCtorIterator (line 385) | NoDefaultCtorIterator& operator++() {
    method NoDefaultCtorIterator (line 389) | NoDefaultCtorIterator operator++(int) {
  function TEST (line 406) | TEST(ForEach, NonDefaultConstructibleIterator) {
  function TEST (line 423) | TEST(ForEachN, NonDefaultConstructibleIterator) {

FILE: tests/future_test.cpp
  function TEST (line 12) | TEST(Future, Invalid) {
  function TEST (line 19) | TEST(Future, MakeReady) {
  function TEST (line 38) | TEST(Future, ThreadPool) {
  function TEST (line 59) | TEST(Future, ThreadPoolForceQueuing) {
  function TEST (line 82) | TEST(Future, TaskSet) {
  function TEST (line 103) | TEST(Future, TaskSetForceQueuing) {
  function TEST (line 124) | TEST(Future, TaskSetWaitImpliesImmediatelyAvailable) {
  function TEST (line 153) | TEST(Future, ConcurrentTaskSetWaitImpliesImmediatelyAvailable) {
  function TEST (line 182) | TEST(Future, LongRunMultipleWaitFor) {
  function TEST (line 208) | TEST(Future, ShareInnerScopeWaits) {
  function TEST (line 222) | TEST(Future, BasicLoop) {
  function TEST (line 238) | TEST(Future, CheckBackwards) {
  function TEST (line 252) | TEST(Future, Async) {
  function TEST (line 265) | TEST(Future, AsyncNotDeferred) {
  function TEST (line 278) | TEST(Future, AsyncNotAsync) {
  function TEST (line 291) | TEST(Future, AsyncSpecifyThreadPool) {
  function TEST (line 305) | TEST(Future, AsyncNotDeferredSpecifyThreadPool) {
  function TEST (line 320) | TEST(Future, AsyncNotAsyncSpecifyThreadPool) {
  function TEST (line 334) | TEST(Future, AsyncSpecifyNewThread) {
  function TEST (line 348) | TEST(Future, AsyncNotDeferredSpecifyNewThread) {
  function TEST (line 364) | TEST(Future, AsyncNotAsyncSpecifyNewThread) {
  function TEST (line 380) | TEST(Future, AsyncSpecifyTaskSet) {
  function TEST (line 397) | TEST(Future, AsyncNotDeferredSpecifyTaskSet) {
  function TEST (line 415) | TEST(Future, AsyncNotAsyncSpecifyTaskSet) {
  function TEST (line 433) | TEST(Future, AsyncSpecifyConcurrentTaskSet) {
  function TEST (line 450) | TEST(Future, AsyncNotDeferredSpecifyConcurrentTaskSet) {
  function TEST (line 468) | TEST(Future, AsyncNotAsyncSpecifyConcurrentTaskSet) {
  type Node (line 486) | struct Node {
  function validateTree (line 491) | static void validateTree(Node* node, int val, int depth) {
  function buildTree (line 502) | static void buildTree(Node* node, int depth) {
  function TEST (line 518) | TEST(Future, RecursivelyBuildTree) {
  function TEST (line 526) | TEST(Future, BasicThenUsage) {
  function TEST (line 561) | TEST(Future, LongerThenChain) {
  function TEST (line 583) | TEST(Future, MultiThenReadyAllInline) {
  function TEST (line 595) | TEST(Future, MultiThenReadyDelayedOrigin) {
  function TEST (line 610) | TEST(Future, MultiThenReadyDelayedOriginNotImmediateThen) {
  function TEST (line 623) | TEST(Future, MultiThenReadyDelayedOriginTightLoop) {
  function TEST (line 653) | TEST(Future, ImmediateInvoker) {
  function TEST (line 665) | TEST(Future, NewThreadInvoker) {
  function TEST (line 673) | TEST(Future, SimpleExceptions) {
  function TEST (line 688) | TEST(Future, SimpleExceptionsReference) {
  function TEST (line 704) | TEST(Future, SimpleExceptionsVoid) {
  function TEST (line 718) | TEST(Future, ThenExceptions) {
  type SomeType (line 735) | struct SomeType {
    method SomeType (line 736) | SomeType(int init) : ptr(new int(init)) {}
    method SomeType (line 738) | SomeType(const SomeType& oth) : ptr(new int(*oth.ptr)) {}
    method SomeType (line 740) | SomeType& operator=(const SomeType& oth) {
  function TEST (line 753) | TEST(Future, ExceptionShouldntDestroyResultIfNotCreated) {
  function TEST (line 775) | TEST(Future, WhenAllEmptyVector) {
  function TEST (line 786) | TEST(Future, WhenAllVector) {
  function TEST (line 811) | TEST(Future, WhenAllEmptyTuple) {
  function TEST (line 817) | TEST(Future, WhenAllTuple) {
  function nodeMove (line 843) | inline std::unique_ptr<Node> nodeMove(const std::unique_ptr<Node>& curre...
  function makeTree (line 847) | static dispenso::Future<std::unique_ptr<Node>> makeTree(
  function fillVector (line 871) | static void fillVector(std::unique_ptr<Node>& node, std::vector<uint32_t...
  function TEST (line 886) | TEST(Future, WhenAllTreeBuild) {
  function makeTreeIters (line 901) | static dispenso::Future<std::unique_ptr<Node>> makeTreeIters(
  function TEST (line 929) | TEST(Future, WhenAllTreeBuildIters) {
  function TEST (line 945) | TEST(Future, TaskSetWaitImpliesFinished) {
  function TEST (line 971) | TEST(Future, TaskSetWaitImpliesWhenAllFinished) {
  function TEST (line 994) | TEST(Future, ConcurrentTaskSetWaitImpliesWhenAllFinished) {
  function TEST (line 1017) | TEST(Future, Copy) {
  function TEST (line 1050) | TEST(Future, Move) {
  function TEST (line 1100) | TEST(Future, ThenRefCountImmediate) {
  function TEST (line 1120) | TEST(Future, MoveThenRefCountImmediate) {
  function TEST (line 1140) | TEST(Future, ThenRefCountThreadPool) {
  function TEST (line 1168) | TEST(Future, MoveThenRefCountThreadPool) {

FILE: tests/graph_test.cpp
  function TEST (line 15) | TEST(Graph, Simple) {
  type EvalMode (line 60) | enum class EvalMode : uint8_t { singleThread, parallelFor, concurrentTas...
  function modeName (line 63) | std::string modeName(const testing::TestParamInfo<typename T::ParamType>...
  class Executor (line 68) | class Executor {
    method Executor (line 70) | Executor()
  type SingleThreadMode (line 92) | struct SingleThreadMode {
  type ParallelForMode (line 95) | struct ParallelForMode {
  type ConcurrentTaskSetMode (line 98) | struct ConcurrentTaskSetMode {
  class TwoSubgraphs (line 102) | class TwoSubgraphs : public testing::TestWithParam<EvalMode> {
    method SetUp (line 104) | void SetUp() override {
    method evaluateGraph (line 131) | void evaluateGraph(const dispenso::Graph& graph) {
  function TEST_P (line 143) | TEST_P(TwoSubgraphs, ReplaceSourceGraph) {
  function TEST_P (line 168) | TEST_P(TwoSubgraphs, ReplaceMiddleGraph) {
  function TEST_P (line 189) | TEST_P(TwoSubgraphs, ReplaceBothGraphs) {
  function TEST_P (line 214) | TEST_P(TwoSubgraphs, PartialEvaluation) {
  class BiPropGraphTest (line 276) | class BiPropGraphTest : public testing::TestWithParam<EvalMode> {
    method SetUp (line 278) | void SetUp() override {
    method checkResults (line 326) | void checkResults() {
  function TEST_P (line 340) | TEST_P(BiPropGraphTest, SimpleEvaluation) {
  class BigTree (line 396) | class BigTree : public testing::Test {
    method sizeOfLevel (line 398) | static size_t sizeOfLevel(size_t level) {
    method SetUp (line 401) | void SetUp() override {
    method rebuildLevel (line 419) | void rebuildLevel(size_t level) {
    method buildLevel (line 438) | void buildLevel(size_t level) {
  function TYPED_TEST (line 486) | TYPED_TEST(BigTree, FullAndPartialEvaluation) {
  function TYPED_TEST (line 523) | TYPED_TEST(BigTree, SubgraphClearAndRebuild) {
  function TEST (line 554) | TEST(BiPropNode, SetUnionMerge) {
  function TEST (line 597) | TEST(BiPropNode, ExistingSetToNew) {
  function TEST (line 623) | TEST(BiPropNode, NewToExistingSet) {
  function TEST (line 649) | TEST(Subgraph, AccessorsBasic) {
  function TEST (line 688) | TEST(Subgraph, MoveConstructor) {
  function TEST (line 710) | TEST(Graph, NodeAccessors) {
  function TEST (line 741) | TEST(Graph, SubgraphAccessors) {
  function TEST (line 764) | TEST(Graph, ClearMethod) {
  type GraphTestException (line 790) | struct GraphTestException : std::runtime_error {
    method GraphTestException (line 791) | GraphTestException() : std::runtime_error("graph test exception") {}
  function TEST (line 794) | TEST(GraphExceptionSafety, SingleThreadExecutor) {
  function TEST (line 820) | TEST(GraphExceptionSafety, ParallelForExecutor) {
  function TEST (line 855) | TEST(GraphExceptionSafety, ConcurrentTaskSetExecutor) {
  function TEST (line 889) | TEST(GraphExceptionSafety, ConcurrentTaskSetExecutor_InlineContinuation) {
  function TEST (line 931) | TEST(GraphExceptionSafety, ConcurrentTaskSetExecutor_DeepGraphException) {
  function TEST (line 1001) | TEST(GraphDepthGuard, ConcurrentTaskSetExecutor_DeepGraph) {

FILE: tests/greedy_for_ranges_test.cpp
  function simpleInner (line 17) | static void simpleInner(int w, int y, const std::vector<int>& image, std...
  function TEST (line 26) | TEST(GreedyForRanges, SimpleLoop) {
  function TEST (line 42) | TEST(GreedyForRanges, ShouldNotInvokeIfEmptyRange) {
  function loopWithStateImpl (line 56) | void loopWithStateImpl(dispenso::ThreadPool& pool = dispenso::globalThre...
  function TEST (line 88) | TEST(GreedyForRanges, LoopWithDequeState) {
  function TEST (line 91) | TEST(GreedyForRanges, LoopWithVectorState) {
  function TEST (line 94) | TEST(GreedyForRanges, LoopWithListState) {
  function TEST (line 98) | TEST(GreedyForRanges, ConcurrentLoopNoCoordination) {
  function TEST (line 128) | TEST(GreedyForRanges, CoordinatedLoops) {
  function concurrentLoop (line 162) | static void concurrentLoop(
  function TEST (line 182) | TEST(GreedyForRanges, CoordinatedConcurrentLoops) {
  function testMaxThreads (line 214) | static void
  function TEST (line 258) | TEST(GreedyForRanges, OptionsMaxThreadsBigPoolStaticChunkingBlocking) {
  function TEST (line 264) | TEST(GreedyForRanges, OptionsMaxThreadsBigPoolStaticChunkingNonBlocking) {
  function TEST (line 270) | TEST(GreedyForRanges, OptionsMaxThreadsBigPoolAutoChunkingBlocking) {
  function TEST (line 276) | TEST(GreedyForRanges, OptionsMaxThreadsBigPoolAutoChunkingNonBlocking) {
  function TEST (line 282) | TEST(GreedyForRanges, OptionsMaxThreadsSmallPoolStaticChunkingBlocking) {
  function TEST (line 288) | TEST(GreedyForRanges, OptionsMaxThreadsSmallPoolStaticChunkingNonBlockin...
  function TEST (line 294) | TEST(GreedyForRanges, OptionsMaxThreadsSmallPoolAutoChunkingBlocking) {
  function TEST (line 300) | TEST(GreedyForRanges, OptionsMaxThreadsSmallPoolAutoChunkingNonBlocking) {
  function TEST (line 306) | TEST(GreedyForRanges, OptionsMaxThreadsSerialStaticChunkingBlocking) {
  function TEST (line 312) | TEST(GreedyForRanges, OptionsMaxThreadsSerialStaticChunkingNonBlocking) {
  function TEST (line 318) | TEST(GreedyForRanges, OptionsMaxThreadsSerialAutoChunkingBlocking) {
  function TEST (line 324) | TEST(GreedyForRanges, OptionsMaxThreadsSerialAutoChunkingNonBlocking) {
  function TEST (line 330) | TEST(GreedyForRanges, NegativeRangeLength) {
  function TEST (line 337) | TEST(GreedyForRanges, NegativeRangeLengthBig) {
  function TEST (line 344) | TEST(GreedyForRanges, ZeroLength2) {
  function TEST (line 351) | TEST(GreedyForRanges, AvoidOverflow1) {
  function TEST (line 365) | TEST(GreedyForRanges, AvoidOverflow2) {
  function TEST (line 391) | TEST(GreedyForRanges, EmptyLoopsWaitIfToldTo) {
  function TEST (line 421) | TEST(GreedyForRanges, SingleLoopWaitIfToldTo) {
  function TEST (line 446) | TEST(GreedyForRanges, ZeroThreads) {
  function TEST (line 466) | TEST(GreedyForRanges, ZeroThreadsWithState) {

FILE: tests/greedy_for_test.cpp
  function simpleInner (line 17) | static void simpleInner(int w, int y, const std::vector<int>& image, std...
  function TEST (line 26) | TEST(GreedyFor, SimpleLoop) {
  function TEST (line 38) | TEST(GreedyFor, ShouldNotInvokeIfEmptyRange) {
  function loopWithStateImpl (line 52) | void loopWithStateImpl(dispenso::ThreadPool& pool = dispenso::globalThre...
  function TEST (line 82) | TEST(GreedyFor, LoopWithDequeState) {
  function TEST (line 85) | TEST(GreedyFor, LoopWithVectorState) {
  function TEST (line 88) | TEST(GreedyFor, LoopWithListState) {
  function TEST (line 92) | TEST(GreedyFor, ConcurrentLoopNoCoordination) {
  function TEST (line 114) | TEST(GreedyFor, CoordinatedLoops) {
  function concurrentLoop (line 137) | static void concurrentLoop(
  function TEST (line 149) | TEST(GreedyFor, CoordinatedConcurrentLoops) {
  function testMaxThreads (line 173) | static void
  function TEST (line 215) | TEST(GreedyFor, OptionsMaxThreadsBigPoolStaticChunkingBlocking) {
  function TEST (line 221) | TEST(GreedyFor, OptionsMaxThreadsBigPoolStaticChunkingNonBlocking) {
  function TEST (line 227) | TEST(GreedyFor, OptionsMaxThreadsBigPoolAutoChunkingBlocking) {
  function TEST (line 233) | TEST(GreedyFor, OptionsMaxThreadsBigPoolAutoChunkingNonBlocking) {
  function TEST (line 239) | TEST(GreedyFor, OptionsMaxThreadsSmallPoolStaticChunkingBlocking) {
  function TEST (line 245) | TEST(GreedyFor, OptionsMaxThreadsSmallPoolStaticChunkingNonBlocking) {
  function TEST (line 251) | TEST(GreedyFor, OptionsMaxThreadsSmallPoolAutoChunkingBlocking) {
  function TEST (line 257) | TEST(GreedyFor, OptionsMaxThreadsSmallPoolAutoChunkingNonBlocking) {
  function TEST (line 263) | TEST(GreedyFor, OptionsMaxThreadsSerialStaticChunkingBlocking) {
  function TEST (line 269) | TEST(GreedyFor, OptionsMaxThreadsSerialStaticChunkingNonBlocking) {
  function TEST (line 275) | TEST(GreedyFor, OptionsMaxThreadsSerialAutoChunkingBlocking) {
  function TEST (line 281) | TEST(GreedyFor, OptionsMaxThreadsSerialAutoChunkingNonBlocking) {
  function TEST (line 287) | TEST(GreedyFor, NegativeRangeLength) {
  function TEST (line 294) | TEST(GreedyFor, NegativeRangeLengthBig) {
  function TEST (line 301) | TEST(GreedyFor, ZeroLength2) {
  function TEST (line 308) | TEST(GreedyFor, AvoidOverflow1) {
  function TEST (line 320) | TEST(GreedyFor, AvoidOverflow2) {
  function TEST (line 346) | TEST(GreedyFor, EmptyLoopsWaitIfToldTo) {
  function TEST (line 374) | TEST(GreedyFor, SingleLoopWaitIfToldTo) {
  function TEST (line 397) | TEST(GreedyFor, ZeroThreads) {
  function TEST (line 413) | TEST(GreedyFor, ZeroThreadsWithState) {
  function TEST (line 419) | TEST(GreedyFor, SimpleLoopFewerItemsThanThreads) {

FILE: tests/latch_test.cpp
  function TEST (line 17) | TEST(Latch, ArriveAndWait) {
  function TEST (line 47) | TEST(Latch, CountDown) {
  function TEST (line 82) | TEST(Latch, ArriveAndWaitWithCountDown) {

FILE: tests/once_function_test.cpp
  function TEST (line 22) | TEST(OnceFunction, Empty) {
  function TEST (line 27) | TEST(OnceFunction, MoveConstructor) {
  function TEST (line 33) | TEST(OnceFunction, MoveOperator) {
  function testSize (line 41) | void testSize() {
  type Foo (line 72) | struct Foo {
  function TEST (line 89) | TEST(OnceFunction, ExtraSmall) {
  function TEST (line 93) | TEST(OnceFunction, Small) {
  function TEST (line 97) | TEST(OnceFunction, Medium) {
  function TEST (line 101) | TEST(OnceFunction, Large) {
  function TEST (line 105) | TEST(OnceFunction, ExtraLarge) {
  function TEST (line 109) | TEST(OnceFunction, MoveWithResult) {
  function ensureDestructor (line 120) | void ensureDestructor() {
  function TEST (line 140) | TEST(OnceFunction, EnsureDestructionExtraSmall) {
  function TEST (line 144) | TEST(OnceFunction, EnsureDestructionSmall) {
  function TEST (line 148) | TEST(OnceFunction, EnsureDestructionMedium) {
  function TEST (line 152) | TEST(OnceFunction, EnsureDestructionLarge) {
  function TEST (line 156) | TEST(OnceFunction, EnsureDestructionExtraLarge) {
  type EnsureAlign (line 161) | struct EnsureAlign {
  function TEST (line 170) | TEST(OnceFunction, EnsureAlignment1) {
  function TEST (line 176) | TEST(OnceFunction, EnsureAlignment2) {
  function TEST (line 181) | TEST(OnceFunction, EnsureAlignment4) {
  function TEST (line 186) | TEST(OnceFunction, EnsureAlignment8) {
  function TEST (line 191) | TEST(OnceFunction, EnsureAlignment16) {
  function TEST (line 196) | TEST(OnceFunction, EnsureAlignment32) {
  function TEST (line 201) | TEST(OnceFunction, EnsureAlignment64) {
  function TEST (line 206) | TEST(OnceFunction, EnsureAlignment128) {
  function TEST (line 211) | TEST(OnceFunction, EnsureAlignment256) {

FILE: tests/pipeline_test.cpp
  function TEST (line 22) | TEST(Pipeline, SingleStageSerial) {
  function TEST (line 35) | TEST(Pipeline, SingleStageSerialPassRefObject) {
  function TEST (line 50) | TEST(Pipeline, MultiStageSerial) {
  function TEST (line 83) | TEST(Pipeline, MultiStageSerialOpResult) {
  function TEST (line 114) | TEST(Pipeline, SingleStageParallel) {
  function TEST (line 132) | TEST(Pipeline, SingleStageParallelPassPrebuiltStage) {
  function TEST (line 151) | TEST(Pipeline, MultiStageGenIsParallel) {
  type Gen (line 187) | struct Gen {
    method Gen (line 188) | Gen(std::atomic<size_t>& c, std::vector<int>& i) : counter(c), inputs(...
  type Xform0 (line 201) | struct Xform0 {
  type Xform1 (line 212) | struct Xform1 {
  type Sink (line 219) | struct Sink {
  function TEST (line 225) | TEST(Pipeline, MultiStageCarryPointers) {
  function TEST (line 244) | TEST(Pipeline, MultiStageCarryPointers2) {
  function TEST (line 259) | TEST(Pipeline, MultiStageCarryPointersMultiFilterParallel) {
  function TEST (line 291) | TEST(Pipeline, MultiStageCarryPointersMultiFilterUnlimitedParallel) {
  function funkGen (line 325) | static TestOptional<size_t> funkGen() {
  function funkSink (line 333) | static void funkSink(size_t in) {
  function TEST (line 337) | TEST(Pipeline, PipelineFunctions) {
  function TEST (line 346) | TEST(Pipeline, PipelineMoveOnly) {
  function TEST (line 365) | TEST(Pipeline, PipelineMoveOnlyWithFiltering) {
  function TEST (line 387) | TEST(Pipeline, ZeroSizeThreadPool) {
  function TEST (line 398) | TEST(Pipeline, SerialStageStackBound) {
  function runExceptionSafetyTest (line 416) | void runExceptionSafetyTest(
  function TEST (line 440) | TEST(Pipeline, ExceptionSafety_SinkSerial) {
  function TEST (line 453) | TEST(Pipeline, ExceptionSafety_SinkParallel) {
  function TEST (line 466) | TEST(Pipeline, ExceptionSafety_SinkUnlimited) {
  function TEST (line 480) | TEST(Pipeline, ExceptionSafety_TransformSerial) {
  function TEST (line 494) | TEST(Pipeline, ExceptionSafety_TransformParallel) {
  function TEST (line 508) | TEST(Pipeline, ExceptionSafety_OpTransform) {
  function TEST (line 523) | TEST(Pipeline, ExceptionSafety_Generator) {
  function TEST (line 532) | TEST(Pipeline, ExceptionSafety_SingleStage) {
  function TEST (line 540) | TEST(Pipeline, ExceptionSafety_MultiStageMiddle) {
  function TEST (line 556) | TEST(Pipeline, ExceptionSafety_PlainFunctionSink) {

FILE: tests/pool_allocator_test.cpp
  function TEST (line 14) | TEST(PoolAllocator, SimpleMallocFree) {
  function TEST (line 24) | TEST(PoolAllocator, TrackAllocations) {
  function TEST (line 89) | TEST(PoolAllocator, SimpleThreaded) {
  function TEST (line 119) | TEST(PoolAllocator, Arena) {
  function TEST (line 155) | TEST(NoLockPoolAllocator, SimpleMallocFree) {
  function TEST (line 164) | TEST(NoLockPoolAllocator, MultipleAllocDealloc) {
  function TEST (line 189) | TEST(PoolAllocator, TotalChunkCapacity) {
  function TEST (line 225) | TEST(PoolAllocator, SingleChunkPerSlab) {
  function TEST (line 249) | TEST(PoolAllocator, ClearEmptyAllocator) {
  function TEST (line 264) | TEST(PoolAllocator, MultipleClearCycles) {
  function TEST (line 313) | TEST(PoolAllocator, ReuseAfterDealloc) {

FILE: tests/priority_test.cpp
  type ThreadInfo (line 30) | struct ThreadInfo {
    method error (line 35) | double error() const {
  function run (line 40) | void run(
  function TEST (line 93) | TEST(Priorty, PriorityGetsCycles) {

FILE: tests/resource_pool_test.cpp
  type Buffer (line 16) | struct Buffer {
    method Buffer (line 17) | Buffer(std::atomic_int& _total_count, std::atomic_int& _num_buffers)
  function BuffersTest (line 29) | void BuffersTest(const int num_threads, const int num_buffers) {
  function TEST (line 53) | TEST(ResourcePool, SameNumBuffersAsThreadsTest) {
  function TEST (line 59) | TEST(ResourcePool, FewerBuffersThanThreadsTest) {
  function TEST (line 65) | TEST(ResourcePool, MoreBuffersThanThreadsTest) {

FILE: tests/rw_lock_test.cpp
  function TEST (line 19) | TEST(RWLock, SimpleUncontested) {
  function TEST (line 30) | TEST(RWLock, BasicWriterTest) {
  function TEST (line 51) | TEST(RWLock, HighContentionReaderWriterTest) {
  function TEST (line 82) | TEST(RWLock, ReaderWriterTest) {
  function TEST (line 117) | TEST(RWLock, TestAlignment) {

FILE: tests/shared_pool_test.cpp
  function TEST (line 19) | TEST(ThreadPool, SharedPool) {

FILE: tests/small_buffer_allocator_test.cpp
  function testAllocDealloc (line 17) | void testAllocDealloc() {
  function TEST (line 29) | TEST(SmallBufferAllocator, AllocDeallocVariousSizes) {
  function testBytesAllocated (line 48) | void testBytesAllocated() {
  function testBytesAllocated (line 58) | void testBytesAllocated() {
  function TEST (line 64) | TEST(SmallBufferAllocator, BytesAllocatedVariousSizes) {
  function testMultipleAllocs (line 76) | void testMultipleAllocs() {
  function TEST (line 99) | TEST(SmallBufferAllocator, MultipleAllocsSmallSize) {
  function TEST (line 103) | TEST(SmallBufferAllocator, MultipleAllocsMediumSize) {
  function TEST (line 107) | TEST(SmallBufferAllocator, MultipleAllocsLargeSize) {
  function TEST (line 111) | TEST(SmallBufferAllocator, MultipleAllocsOverMaxSize) {
  function TEST (line 116) | TEST(SmallBufferAllocator, AllocationReuse) {
  function TEST (line 128) | TEST(SmallBufferAllocator, ThreadedAllocDealloc) {

FILE: tests/small_vector_test.cpp
  type Tracked (line 18) | struct Tracked {
    method reset (line 24) | static void reset() {
    method Tracked (line 33) | Tracked() : value(0) {
    method Tracked (line 36) | explicit Tracked(int v) : value(v) {
    method Tracked (line 39) | Tracked(const Tracked& other) : value(other.value) {
    method Tracked (line 43) | Tracked(Tracked&& other) noexcept : value(other.value) {
    method Tracked (line 51) | Tracked& operator=(const Tracked& other) {
    method Tracked (line 55) | Tracked& operator=(Tracked&& other) noexcept {
  function TEST (line 69) | TEST(SmallVector, DefaultConstruction) {
  function TEST (line 76) | TEST(SmallVector, ConstructionWithSize) {
  function TEST (line 84) | TEST(SmallVector, ConstructionWithSizeAndValue) {
  function TEST (line 92) | TEST(SmallVector, InitializerListConstruction) {
  function TEST (line 99) | TEST(SmallVector, CopyConstruction) {
  function TEST (line 111) | TEST(SmallVector, MoveConstructionInline) {
  function TEST (line 121) | TEST(SmallVector, MoveConstructionHeap) {
  function TEST (line 134) | TEST(SmallVector, CopyAssignment) {
  function TEST (line 144) | TEST(SmallVector, MoveAssignment) {
  function TEST (line 153) | TEST(SmallVector, MoveAssignmentHeapToHeap) {
  function TEST (line 163) | TEST(SmallVector, MoveAssignmentInlineToHeap) {
  function TEST (line 174) | TEST(SmallVector, SelfAssignment) {
  function TEST (line 190) | TEST(SmallVector, OperatorBracket) {
  function TEST (line 199) | TEST(SmallVector, FrontAndBack) {
  function TEST (line 209) | TEST(SmallVector, Data) {
  function TEST (line 219) | TEST(SmallVector, Empty) {
  function TEST (line 226) | TEST(SmallVector, Size) {
  function TEST (line 237) | TEST(SmallVector, PushBack) {
  function TEST (line 248) | TEST(SmallVector, PushBackGrowth) {
  function TEST (line 259) | TEST(SmallVector, EmplaceBack) {
  function TEST (line 270) | TEST(SmallVector, PopBack) {
  function TEST (line 280) | TEST(SmallVector, Clear) {
  function TEST (line 287) | TEST(SmallVector, ResizeGrow) {
  function TEST (line 298) | TEST(SmallVector, ResizeShrink) {
  function TEST (line 306) | TEST(SmallVector, ResizeWithValue) {
  function TEST (line 317) | TEST(SmallVector, EraseMiddle) {
  function TEST (line 328) | TEST(SmallVector, EraseFirst) {
  function TEST (line 336) | TEST(SmallVector, EraseLast) {
  function TEST (line 346) | TEST(SmallVector, RangeBasedFor) {
  function TEST (line 355) | TEST(SmallVector, ConstIteration) {
  function TEST (line 364) | TEST(SmallVector, BeginEnd) {
  function TEST (line 371) | TEST(SmallVector, CBeginCEnd) {
  function TEST (line 378) | TEST(SmallVector, ConstAccessors) {
  function TEST (line 391) | TEST(SmallVector, DestructorsCalled) {
  function TEST (line 404) | TEST(SmallVector, DestructorsCalledOnClear) {
  function TEST (line 414) | TEST(SmallVector, DestructorsCalledOnGrowth) {
  function TEST (line 425) | TEST(SmallVector, MoveSemantics) {
  function TEST (line 442) | TEST(SmallVector, StringOperations) {
  function TEST (line 453) | TEST(SmallVector, StringGrowth) {
  function TEST (line 466) | TEST(SmallVector, LargeInlineCapacity) {
  function TEST (line 478) | TEST(SmallVector, DifferentInlineCapacities) {
  function TEST (line 487) | TEST(SmallVector, HeapRegrowth) {
  function TEST (line 500) | TEST(SmallVector, HeapRegrowthWithReserve) {
  function TEST (line 513) | TEST(SmallVector, HeapRegrowthTracked) {
  function TEST (line 528) | TEST(SmallVector, EraseOnHeap) {
  function TEST (line 543) | TEST(SmallVector, PopBackOnHeap) {

FILE: tests/spsc_ring_buffer_test.cpp
  function TEST (line 25) | TEST(SPSCRingBuffer, DefaultConstructionIsEmpty) {
  function TEST (line 32) | TEST(SPSCRingBuffer, CapacityIsCorrect) {
  function TEST (line 58) | TEST(SPSCRingBuffer, PushAndPopSingleElement) {
  function TEST (line 72) | TEST(SPSCRingBuffer, PushMoveSemantics) {
  function TEST (line 85) | TEST(SPSCRingBuffer, PushCopySemantics) {
  function TEST (line 98) | TEST(SPSCRingBuffer, TryEmplace) {
  function TEST (line 110) | TEST(SPSCRingBuffer, PopFromEmpty) {
  function TEST (line 119) | TEST(SPSCRingBuffer, TryPopReturnsOpResult) {
  function TEST (line 136) | TEST(SPSCRingBuffer, TryPopOpResultWithMoveOnlyType) {
  function TEST (line 149) | TEST(SPSCRingBuffer, TryPopOpResultWithString) {
  function TEST (line 158) | TEST(SPSCRingBuffer, FIFOOrder) {
  function TEST (line 176) | TEST(SPSCRingBuffer, FillToCapacity) {
  function TEST (line 194) | TEST(SPSCRingBuffer, FullThenPop) {
  function TEST (line 215) | TEST(SPSCRingBuffer, CapacityOne) {
  function TEST (line 233) | TEST(SPSCRingBuffer, SizeTracking) {
  function TEST (line 254) | TEST(SPSCRingBuffer, WrapAround) {
  function TEST (line 288) | TEST(SPSCRingBuffer, MultipleWrapArounds) {
  function TEST (line 305) | TEST(SPSCRingBuffer, SizeWithWrapAround) {
  function TEST (line 330) | TEST(SPSCRingBuffer, MoveOnlyType) {
  function TEST (line 343) | TEST(SPSCRingBuffer, MoveOnlyTypeMultiple) {
  type NonTrivial (line 362) | struct NonTrivial {
    method NonTrivial (line 369) | NonTrivial() : value(0) {
    method NonTrivial (line 372) | explicit NonTrivial(int v) : value(v) {
    method NonTrivial (line 375) | NonTrivial(const NonTrivial& other) : value(other.value) {
    method NonTrivial (line 378) | NonTrivial(NonTrivial&& other) noexcept : value(other.value) {
    method NonTrivial (line 382) | NonTrivial& operator=(const NonTrivial& other) {
    method NonTrivial (line 386) | NonTrivial& operator=(NonTrivial&& other) noexcept {
    method resetCounters (line 396) | static void resetCounters() {
  function TEST (line 408) | TEST(SPSCRingBuffer, NonTrivialType) {
  function TEST (line 426) | TEST(SPSCRingBuffer, NonTrivialDestructorOnBufferDestruction) {
  function TEST (line 445) | TEST(SPSCRingBuffer, ConcurrentProducerConsumer) {
  function TEST (line 488) | TEST(SPSCRingBuffer, ConcurrentWithSmallBuffer) {
  function TEST (line 528) | TEST(SPSCRingBuffer, ConcurrentWithMoveOnlyType) {
  function TEST (line 573) | TEST(SPSCRingBuffer, EmptyAfterDraining) {
  function TEST (line 588) | TEST(SPSCRingBuffer, RepeatedFillAndDrain) {
  function TEST (line 612) | TEST(SPSCRingBuffer, AlternatingPushPop) {
  function TEST (line 624) | TEST(SPSCRingBuffer, LargeElements) {
  function TEST (line 655) | TEST(SPSCRingBuffer, StringElements) {
  function TEST (line 676) | TEST(SPSCRingBuffer, TryPopIntoBasic) {
  function TEST (line 694) | TEST(SPSCRingBuffer, TryPopIntoMoveOnly) {
  function TEST (line 715) | TEST(SPSCRingBuffer, TryPopIntoNonTrivial) {
  type NonDefaultConstructible (line 747) | struct NonDefaultConstructible {
    method NonDefaultConstructible (line 751) | explicit NonDefaultConstructible(int v) : value(v) {}
    method NonDefaultConstructible (line 752) | NonDefaultConstructible(const NonDefaultConstructible&) = default;
    method NonDefaultConstructible (line 753) | NonDefaultConstructible(NonDefaultConstructible&&) = default;
    method NonDefaultConstructible (line 754) | NonDefaultConstructible& operator=(const NonDefaultConstructible&) = d...
    method NonDefaultConstructible (line 755) | NonDefaultConstructible& operator=(NonDefaultConstructible&&) = default;
  function TEST (line 759) | TEST(SPSCRingBuffer, NonDefaultConstructibleType) {
  function TEST (line 780) | TEST(SPSCRingBuffer, NonDefaultConstructibleWithMove) {
  function TEST (line 797) | TEST(SPSCRingBuffer, LargeCapacity) {
  function TEST (line 820) | TEST(SPSCRingBuffer, LargeCapacityConcurrent) {
  function TEST (line 863) | TEST(SPSCRingBuffer, TryPushBatchBasic) {
  function TEST (line 880) | TEST(SPSCRingBuffer, TryPushBatchPartial) {
  function TEST (line 899) | TEST(SPSCRingBuffer, TryPushBatchEmpty) {
  function TEST (line 909) | TEST(SPSCRingBuffer, TryPushBatchWhenFull) {
  function TEST (line 925) | TEST(SPSCRingBuffer, TryPopBatchBasic) {
  function TEST (line 943) | TEST(SPSCRingBuffer, TryPopBatchPartial) {
  function TEST (line 965) | TEST(SPSCRingBuffer, TryPopBatchEmpty) {
  function TEST (line 978) | TEST(SPSCRingBuffer, BatchWithWrapAround) {
  function TEST (line 1010) | TEST(SPSCRingBuffer, BatchConcurrent) {
  function TEST (line 1069) | TEST(SPSCRingBuffer, StressTest) {

FILE: tests/task_set_test.cpp
  type ScheduleType (line 21) | enum ScheduleType { kDefault, kForceQueue, kMixed }
  class TaskSetTest (line 23) | class TaskSetTest : public ::testing::TestWithParam<ScheduleType> {
    method SetUp (line 25) | void SetUp() override {}
    method TearDown (line 26) | void TearDown() override {}
    method schedule (line 29) | void schedule(TaskSetT& taskSet, F&& f) {
  function TEST_P (line 56) | TEST_P(TaskSetTest, MixedWork) {
  function TEST_P (line 77) | TEST_P(TaskSetTest, MultiWait) {
  function TEST_P (line 111) | TEST_P(TaskSetTest, MultiSet) {
  function TEST_P (line 148) | TEST_P(TaskSetTest, MultiSetTryWait) {
  function TEST (line 188) | TEST(TaskSetTest, ParamConstruction) {
  function TEST (line 196) | TEST(ConcurrentTaskSetTest, ParamConstruction) {
  function recursiveFunc (line 204) | static void recursiveFunc(dispenso::ThreadPool& pool, int num) {
  function TEST (line 219) | TEST(TaskSet, Recursive) {
  type Node (line 224) | struct Node {
  function buildTree (line 229) | static void buildTree(dispenso::ConcurrentTaskSet& tasks, std::unique_pt...
  function verifyTree (line 238) | static void verifyTree(const std::unique_ptr<Node>& node, int depthRemai...
  function TEST (line 246) | TEST(ConcurrentTaskSet, DoTree) {
  function TEST (line 255) | TEST(TaskSet, OneChildCancels) {
  function TEST (line 270) | TEST(TaskSet, ParentThreadCancels) {
  function TEST (line 285) | TEST(TaskSet, CascadingCancelOne) {
  function TEST (line 303) | TEST(TaskSet, CascadingOne) {
  function TEST (line 320) | TEST(TaskSet, CascadingManyCancel) {
  function TEST (line 349) | TEST(TaskSet, CascadingMany) {
  function TEST (line 378) | TEST(TaskSet, Exception) {
  function TEST (line 397) | TEST(ConcurrentTaskSet, Exception) {
  function TEST (line 416) | TEST(TaskSet, ExceptionNoForceQueuing) {
  function TEST (line 435) | TEST(ConcurrentTaskSet, ExceptionNoForceQueuing) {
  function TEST (line 454) | TEST(TaskSet, ExceptionTryWait) {
  function TEST (line 474) | TEST(ConcurrentTaskSet, ExceptionTryWait) {
  function TEST (line 494) | TEST(TaskSet, ExceptionNoForceQueuingTryWait) {
  function TEST (line 514) | TEST(ConcurrentTaskSet, ExceptionNoForceQueuingTryWait) {
  function TEST (line 534) | TEST(TaskSet, ExceptionCancels) {
  function TEST (line 562) | TEST(TaskSet, EmptyTaskSet) {
  function TEST (line 577) | TEST(TaskSet, TryWaitOnEmpty) {
  function TEST (line 587) | TEST(TaskSet, CanceledStateBeforeCancellation) {
  function TEST (line 606) | TEST(TaskSet, SingleTask) {
  function TEST (line 617) | TEST(TaskSet, GlobalThreadPool) {
  function TEST (line 631) | TEST(ConcurrentTaskSet, EmptyTaskSet) {
  function TEST (line 639) | TEST(ConcurrentTaskSet, TryWaitOnEmpty) {
  function TEST (line 647) | TEST(ConcurrentTaskSet, ConcurrentScheduling) {
  function TEST (line 677) | TEST(TaskSet, LargeBatchOfTasks) {
  function TEST (line 695) | TEST(ConcurrentTaskSet, CanceledState) {
  function TEST (line 717) | TEST(TaskSet, ScheduleBulkBasic) {
  function TEST (line 733) | TEST(TaskSet, ScheduleBulkEdgeCases) {
  function TEST (line 771) | TEST(TaskSet, ScheduleBulkLarge) {
  function TEST (line 787) | TEST(TaskSet, ScheduleBulkMultipleWaits) {
  function TEST (line 811) | TEST(TaskSet, ScheduleBulkMixedWithSchedule) {
  function TEST (line 841) | TEST(TaskSet, ScheduleBulkCancellation) {
  function TEST (line 868) | TEST(TaskSet, ScheduleBulkException) {
  function TEST (line 891) | TEST(ConcurrentTaskSet, ScheduleBulkBasic) {
  function TEST (line 907) | TEST(ConcurrentTaskSet, ScheduleBulkEdgeCases) {
  function TEST (line 943) | TEST(ConcurrentTaskSet, ScheduleBulkConcurrent) {
  function TEST (line 967) | TEST(ConcurrentTaskSet, ScheduleBulkMixedWithSchedule) {
  function TEST (line 1001) | TEST(ConcurrentTaskSet, ScheduleBulkCancellation) {

FILE: tests/test_tid.h
  function resetTestTid (line 21) | inline void resetTestTid() {
  function getTestTid (line 26) | inline int getTestTid() {

FILE: tests/thread_id_test.cpp
  function TEST (line 16) | TEST(ThreadId, Repeatable) {
  function TEST (line 38) | TEST(ThreadId, Unique) {

FILE: tests/thread_pool_test.cpp
  function TEST (line 25) | TEST(ThreadPool, SimpleCreationDestruction) {
  function TEST (line 30) | TEST(ThreadPool, Resize) {
  type ScheduleType (line 41) | enum ScheduleType { kDefault, kForceQueue, kMixed }
  class ThreadPoolTest (line 43) | class ThreadPoolTest : public testing::TestWithParam<ScheduleType> {
    method initPool (line 45) | void initPool(size_t threads) {
    method schedule (line 50) | void schedule(F&& f) {
    method destroyPool (line 68) | void destroyPool() {
  function TEST_P (line 82) | TEST_P(ThreadPoolTest, SimpleWork) {
  function TEST_P (line 102) | TEST_P(ThreadPoolTest, MixedWork) {
  function TEST (line 123) | TEST(ThreadPool, ResizeConcurrent) {
  function TEST (line 150) | TEST(ThreadPool, ResizeMoreConcurrent) {
  function TEST (line 186) | TEST(ThreadPool, SetSignalingWakeConcurrent) {
  function TEST (line 223) | TEST(ThreadPool, ResizeCheckApproxActualRunningThreads) {
  function TEST_P (line 266) | TEST_P(ThreadPoolTest, CrossPoolTest) {
  function TEST (line 287) | TEST(ThreadPool, SetSignalingWakeConcurrentZeroLatency) {
  function TEST_P (line 324) | TEST_P(ThreadPoolTest, SimpleWorkZeroLatencyPoll) {
  function TEST (line 373) | TEST(ThreadPool, SpinPollWithSleep) {
  function TEST (line 397) | TEST(ThreadPool, SpinPollWithSleepForceQueue) {
  function TEST (line 420) | TEST(ThreadPool, SpinPollWithSleepResizeConcurrent) {
  function TEST (line 454) | TEST(ThreadPool, TransitionBetweenModes) {
  function TEST (line 487) | TEST(ThreadPool, SpinPollWithDifferentSleepDurations) {
  function TEST (line 510) | TEST(ThreadPool, SingleThreadSpinPoll) {
  function TEST (line 529) | TEST(ThreadPool, ScheduleBulkBasic) {
  function TEST (line 543) | TEST(ThreadPool, ScheduleBulkZero) {
  function TEST (line 554) | TEST(ThreadPool, ScheduleBulkOne) {
  function TEST (line 565) | TEST(ThreadPool, ScheduleBulkTwo) {
  function TEST (line 577) | TEST(ThreadPool, ScheduleBulkLarge) {
  function TEST (line 591) | TEST(ThreadPool, ScheduleBulkMixedWithSchedule) {
  function TEST (line 620) | TEST(ThreadPool, ScheduleBulkConcurrent) {
  function TEST (line 641) | TEST(ThreadPool, RepeatedCreationDestruction) {

FILE: tests/timed_task_test.cpp
  function errAdjust (line 29) | double errAdjust(double eps) {
  function TEST (line 57) | TEST(TimedTaskTest, RunLikelyZeroTimes) {
  function TEST (line 84) | TEST(TimedTaskTest, MoveAndRunLikelyZeroTimes) {
  function TEST (line 113) | TEST(TimedTaskTest, RunOnce) {
  function TEST (line 142) | TEST(TimedTaskTest, RunPeriodic) {
  function TEST (line 185) | TEST(TimedTaskTest, RunPeriodicSteady) {
  function TEST (line 230) | TEST(TimedTaskTest, RunPeriodicDontWait) {
  function TEST (line 277) | TEST(TimedTaskTest, RunPeriodicSteadyUnderLoad) {
  function TEST (line 340) | TEST(TimedTaskTest, RunDetach) {
  function TEST (line 382) | TEST(TimedTaskTest, RunChronoDelayByDuration) {
  function TEST (line 410) | TEST(TimedTaskTest, RunChronoDelayToTimePoint) {
  function TEST (line 438) | TEST(TimedTaskTest, RunChronoDelayByDurationWithPeriod) {
  function TEST (line 478) | TEST(TimedTaskTest, RunChronoDelayByDurationWithPeriodSteady) {
  function TEST (line 518) | TEST(TimedTaskTest, RunChronoDelayToTimepointWithPeriod) {
  function TEST (line 558) | TEST(TimedTaskTest, RunChronoDelayToTimepointWithPeriodSteady) {
  function TEST (line 600) | TEST(TimedTaskTest, RunOnceImmediatelyLongPeriod) {
  function TEST (line 631) | TEST(TimedTaskTest, CancelViaProvidedFunction) {
  function TEST (line 664) | TEST(TimedTaskTest, CancelViaProvidedFunctionInThreadPool) {
  function TEST (line 697) | TEST(TimedTaskTest, FunctionDestructionPostTaskDestruct) {

FILE: tests/timing_test.cpp
  function TEST (line 22) | TEST(Timing, GetTimeReturnsNonNegative) {
  function TEST (line 27) | TEST(Timing, GetTimeIsMonotonic) {
  function TEST (line 36) | TEST(Timing, GetTimeProgresses) {
  function TEST (line 46) | TEST(Timing, StatisticalAccuracy) {
  function TEST (line 87) | TEST(Timing, LongerDurationAccuracy) {
  function TEST (line 121) | TEST(Timing, RapidCalls) {

FILE: tests/util_test.cpp
  type RefCounted (line 24) | struct RefCounted {
    method RefCounted (line 25) | RefCounted() {
  type NonTrivial (line 35) | struct NonTrivial {
    method NonTrivial (line 38) | NonTrivial(int v) : value(v) {
    method NonTrivial (line 42) | NonTrivial(const NonTrivial& other) : value(other.value) {
    method NonTrivial (line 46) | NonTrivial(NonTrivial&& other) : value(other.value) {
  function TEST (line 56) | TEST(Util, AlignedMallocAndFree) {
  function TEST (line 83) | TEST(Util, AlignedMallocUsable) {
  function TEST (line 102) | TEST(Util, AlignedDeleter) {
  function TEST (line 136) | TEST(Util, AlignToCacheLine) {
  function TEST (line 149) | TEST(Util, CpuRelax) {
  function TEST (line 156) | TEST(Util, NextPow2) {
  function TEST (line 178) | TEST(Util, Log2Const) {
  function TEST (line 199) | TEST(Util, Log2) {
  function TEST (line 221) | TEST(Util, AlignedBuffer) {
  function TEST (line 238) | TEST(Util, AlignedAtomic) {
  function TEST (line 274) | TEST(Util, OpResult) {

Download .json

Condensed preview — 268 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (6,026K chars).

[
  {
    "path": ".clang-format",
    "chars": 2617,
    "preview": "---\nAccessModifierOffset: -1\nAlignAfterOpenBracket: AlwaysBreak\nAlignConsecutiveAssignments: false\nAlignConsecutiveDecla"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.yml",
    "chars": 1260,
    "preview": "name: Bug Report\ndescription: File a bug report\ntitle: \"[Bug]: \"\nlabels: [\"bug\", \"triage\"]\nassignees:\n  - graphicsMan\nbo"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "chars": 28,
    "preview": "blank_issues_enabled: false\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.yml",
    "chars": 996,
    "preview": "name: Feature Request\ndescription: File a feature request\ntitle: \"[Feature Request]: \"\nlabels: [\"feature\", \"request\"]\nas"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "chars": 1509,
    "preview": "# PR Details\n\n<!--- Provide a general summary of your changes in the Title above -->\n\n## Description\n\n<!--- Describe you"
  },
  {
    "path": ".github/workflows/build.yml",
    "chars": 7737,
    "preview": "name: Build and test\n\non:\n  push:\n    branches: [main]\n  pull_request:\n    branches: [main]\n\npermissions:\n  contents: re"
  },
  {
    "path": ".github/workflows/codeql.yml",
    "chars": 743,
    "preview": "name: CodeQL\n\non:\n  push:\n    branches: [main]\n  pull_request:\n    branches: [main]\n  schedule:\n    - cron: '17 9 * * 1'"
  },
  {
    "path": ".github/workflows/docs.yml",
    "chars": 887,
    "preview": "name: Docs\n\non:\n  push:\n    branches: [ main ]\n\npermissions:\n  contents: write\n\njobs:\n  build:\n    runs-on: ubuntu-lates"
  },
  {
    "path": ".gitignore",
    "chars": 447,
    "preview": "# Compiled source #\n###################\n*.com\n*.class\n*.dll\n*.exe\n*.o\n*.so\n*.a\nbin\nlib\n\n# Packages #\n############\n# it's"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 16364,
    "preview": "1.5.1 (March 28, 2026)\n\n### Bug fixes\n* Fixed `__ulock_wait`/`__ulock_wake` usage on macOS versions prior to 10.12 and o"
  },
  {
    "path": "CMakeLists.txt",
    "chars": 5405,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the MIT license found in the"
  },
  {
    "path": "CMakePresets.json",
    "chars": 2690,
    "preview": "{\n  \"version\": 6,\n  \"cmakeMinimumRequired\": {\n    \"major\": 3,\n    \"minor\": 21,\n    \"patch\": 0\n  },\n  \"configurePresets\":"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "chars": 3355,
    "preview": "# Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, we as\ncontributors and"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 2553,
    "preview": "# Contributing to dispenso\nWe want to make contributing to this project as easy and transparent as\npossible.  There is a"
  },
  {
    "path": "LICENSE",
    "chars": 1086,
    "preview": "MIT License\n\nCopyright (c) Facebook, Inc. and its affiliates.\n\nPermission is hereby granted, free of charge, to any pers"
  },
  {
    "path": "README.md",
    "chars": 14286,
    "preview": "[![Build and test](https://github.com/facebookincubator/dispenso/actions/workflows/build.yml/badge.svg)](https://github."
  },
  {
    "path": "benchmarks/CMakeLists.txt",
    "chars": 4413,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the MIT license found in the"
  },
  {
    "path": "benchmarks/benchmark_common.h",
    "chars": 490,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/cascading_parallel_for_benchmark.cpp",
    "chars": 11516,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/concurrent_vector_benchmark.cpp",
    "chars": 21878,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/CMakeLists.txt",
    "chars": 1264,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the MIT license found in the"
  },
  {
    "path": "benchmarks/fast_math/avx512_benchmarks.cpp",
    "chars": 6551,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/avx_benchmarks.cpp",
    "chars": 7128,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/benchmark_helpers.h",
    "chars": 8331,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/benchmarks.cpp",
    "chars": 17425,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/erf_benchmarks.cpp",
    "chars": 20853,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/hwy_benchmarks.cpp",
    "chars": 8158,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/neon_benchmarks.cpp",
    "chars": 5765,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/fast_math/sse_benchmarks.cpp",
    "chars": 7830,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/for_each_benchmark.cpp",
    "chars": 5426,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/for_latency_benchmark.cpp",
    "chars": 4954,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/future_benchmark.cpp",
    "chars": 12633,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/graph_benchmark.cpp",
    "chars": 9821,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/graph_scene_benchmark.cpp",
    "chars": 17691,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/idle_pool_benchmark.cpp",
    "chars": 4300,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/locality_benchmark.cpp",
    "chars": 7086,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/nested_for_benchmark.cpp",
    "chars": 7896,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/nested_pool_benchmark.cpp",
    "chars": 5249,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/once_function_benchmark.cpp",
    "chars": 3881,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/pipeline_benchmark.cpp",
    "chars": 10603,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/pool_allocator_benchmark.cpp",
    "chars": 6135,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/run_benchmarks.py",
    "chars": 14071,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "benchmarks/rw_lock_benchmark.cpp",
    "chars": 3054,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/simple_for_benchmark.cpp",
    "chars": 6841,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/simple_pool_benchmark.cpp",
    "chars": 3626,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/small_buffer_benchmark.cpp",
    "chars": 2090,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/summing_for_benchmark.cpp",
    "chars": 5932,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/tbb_compat.h",
    "chars": 2303,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/thread_benchmark_common.h",
    "chars": 2256,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/timed_task_benchmark.cpp",
    "chars": 8992,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "benchmarks/trivial_compute_benchmark.cpp",
    "chars": 5823,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "cmake/DispensoConfig.cmake.in",
    "chars": 460,
    "preview": "# \n# Copyright (c) Meta Platforms, Inc. and affiliates.\n# \n# This source code is licensed under the MIT license found in"
  },
  {
    "path": "codecov.yml",
    "chars": 707,
    "preview": "# Codecov configuration for dispenso\n# See https://docs.codecov.com/docs/codecovyml-reference\n\ncoverage:\n  # Overall pro"
  },
  {
    "path": "dispenso/CMakeLists.txt",
    "chars": 4640,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the MIT license found in the"
  },
  {
    "path": "dispenso/async_request.h",
    "chars": 3597,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/completion_event.h",
    "chars": 2487,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/concurrent_object_arena.h",
    "chars": 11726,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/concurrent_vector.h",
    "chars": 45093,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/can_invoke.h",
    "chars": 605,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/completion_event_impl.h",
    "chars": 11748,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/concurrent_vector_impl.h",
    "chars": 16445,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/concurrent_vector_impl2.h",
    "chars": 4798,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/epoch_waiter.h",
    "chars": 7930,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/future_impl.h",
    "chars": 17970,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/future_impl2.h",
    "chars": 9116,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/graph_executor_impl.h",
    "chars": 4213,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/math.h",
    "chars": 3676,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/notifier_common.h",
    "chars": 4346,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/once_callable_impl.h",
    "chars": 1408,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/op_result.h",
    "chars": 1814,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/per_thread_info.cpp",
    "chars": 466,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/per_thread_info.h",
    "chars": 1435,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/pipeline_impl.h",
    "chars": 16793,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/quanta.cpp",
    "chars": 709,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/quanta.h",
    "chars": 327,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/result_of.h",
    "chars": 698,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/rw_lock_impl.h",
    "chars": 4087,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/small_buffer_allocator_impl.h",
    "chars": 8198,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/task_set_impl.h",
    "chars": 7894,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/detail/timed_task_impl.h",
    "chars": 1712,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/dispenso.h",
    "chars": 1500,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/README.md",
    "chars": 10753,
    "preview": "# dispenso::fast_math\n\n> **EXPERIMENTAL** — This sublibrary is under active development. The API is\n> unstable and subje"
  },
  {
    "path": "dispenso/fast_math/detail/double_promote.h",
    "chars": 20746,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/detail/fast_math_impl.h",
    "chars": 14882,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/fast_math.h",
    "chars": 85157,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/float_traits.h",
    "chars": 3175,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/float_traits_avx.h",
    "chars": 17446,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/float_traits_avx512.h",
    "chars": 22213,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/float_traits_hwy.h",
    "chars": 20169,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/float_traits_neon.h",
    "chars": 18128,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/float_traits_x86.h",
    "chars": 16968,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/simd.h",
    "chars": 795,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/fast_math/util.h",
    "chars": 17449,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/for_each.h",
    "chars": 9604,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/future.h",
    "chars": 25790,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/graph.cpp",
    "chars": 5804,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/graph.h",
    "chars": 23026,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/graph_executor.cpp",
    "chars": 7518,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/graph_executor.h",
    "chars": 3018,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/latch.h",
    "chars": 1995,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/once_function.h",
    "chars": 4347,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/parallel_for.h",
    "chars": 33246,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/pipeline.h",
    "chars": 6581,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/platform.h",
    "chars": 8312,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/pool_allocator.cpp",
    "chars": 3430,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/pool_allocator.h",
    "chars": 2916,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/priority.cpp",
    "chars": 5221,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/priority.h",
    "chars": 1540,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/resource_pool.h",
    "chars": 4061,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/rw_lock.h",
    "chars": 3665,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/schedulable.h",
    "chars": 3023,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/small_buffer_allocator.cpp",
    "chars": 3436,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/small_buffer_allocator.h",
    "chars": 4470,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/small_vector.h",
    "chars": 12491,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/spsc_ring_buffer.h",
    "chars": 22938,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/task_set.cpp",
    "chars": 4892,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/task_set.h",
    "chars": 14713,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/third-party/moodycamel/LICENSE.md",
    "chars": 3266,
    "preview": "This license file applies to everything in this repository except that which\nis explicitly annotated as being written by"
  },
  {
    "path": "dispenso/third-party/moodycamel/README.txt",
    "chars": 1805,
    "preview": "https://github.com/cameron314/concurrentqueue\n\ncommit 65d6970912fc3f6bb62d80edf95ca30e0df85137 (HEAD -> master, origin/m"
  },
  {
    "path": "dispenso/third-party/moodycamel/blockingconcurrentqueue.h",
    "chars": 21515,
    "preview": "// Provides an efficient blocking version of moodycamel::ConcurrentQueue.\n// ©2015-2020 Cameron Desrochers. Distributed "
  },
  {
    "path": "dispenso/third-party/moodycamel/concurrentqueue.h",
    "chars": 152954,
    "preview": "// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.\n// An overview, including benchm"
  },
  {
    "path": "dispenso/third-party/moodycamel/lightweightsemaphore.h",
    "chars": 11756,
    "preview": "// Provides an efficient implementation of a semaphore (LightweightSemaphore).\n// This is an extension of Jeff Preshing'"
  },
  {
    "path": "dispenso/thread_id.cpp",
    "chars": 621,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/thread_id.h",
    "chars": 906,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/thread_pool.cpp",
    "chars": 7879,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/thread_pool.h",
    "chars": 17220,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/timed_task.cpp",
    "chars": 3263,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/timed_task.h",
    "chars": 11967,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/timing.cpp",
    "chars": 4349,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/timing.h",
    "chars": 598,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/tsan_annotations.cpp",
    "chars": 1966,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/tsan_annotations.h",
    "chars": 3125,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/util.h",
    "chars": 8063,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "dispenso/utils/graph_dot.h",
    "chars": 2793,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "docs/Doxyfile",
    "chars": 109315,
    "preview": "# Doxyfile 1.8.14\n\n# This file describes the settings to be used by the documentation system\n# doxygen (www.doxygen.org)"
  },
  {
    "path": "docs/benchmarks/benchmark_results.md",
    "chars": 6001,
    "preview": "# Dispenso Benchmark Results\n\n## Machine Information\n\n- **Date**: 2026-02-05T15:59:21.769268\n- **Platform**: Linux 6.17."
  },
  {
    "path": "docs/benchmarks/concurrent_vector_details.md",
    "chars": 3004,
    "preview": "# concurrent_vector - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------"
  },
  {
    "path": "docs/benchmarks/concurrent_vector_tcmalloc_details.md",
    "chars": 3013,
    "preview": "# concurrent_vector_tcmalloc - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|---"
  },
  {
    "path": "docs/benchmarks/for_latency_details.md",
    "chars": 2387,
    "preview": "# for_latency - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM"
  },
  {
    "path": "docs/benchmarks/future_details.md",
    "chars": 1344,
    "preview": "# future - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_seri"
  },
  {
    "path": "docs/benchmarks/graph_details.md",
    "chars": 869,
    "preview": "# graph - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_build"
  },
  {
    "path": "docs/benchmarks/graph_scene_details.md",
    "chars": 389,
    "preview": "# graph_scene - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM"
  },
  {
    "path": "docs/benchmarks/idle_pool_details.md",
    "chars": 7729,
    "preview": "# idle_pool - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_t"
  },
  {
    "path": "docs/benchmarks/index.html",
    "chars": 227030,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"UTF-8\">\n<meta name=\"viewport\" content=\"width=device-width, initia"
  },
  {
    "path": "docs/benchmarks/nested_for_details.md",
    "chars": 7459,
    "preview": "# nested_for - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_"
  },
  {
    "path": "docs/benchmarks/nested_pool_details.md",
    "chars": 7272,
    "preview": "# nested_pool - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM"
  },
  {
    "path": "docs/benchmarks/once_function_details.md",
    "chars": 1396,
    "preview": "# once_function - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| "
  },
  {
    "path": "docs/benchmarks/pipeline_details.md",
    "chars": 463,
    "preview": "# pipeline - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_di"
  },
  {
    "path": "docs/benchmarks/pool_allocator_details.md",
    "chars": 4743,
    "preview": "# pool_allocator - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n|"
  },
  {
    "path": "docs/benchmarks/rw_lock_details.md",
    "chars": 5549,
    "preview": "# rw_lock - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_ser"
  },
  {
    "path": "docs/benchmarks/simple_for_details.md",
    "chars": 17192,
    "preview": "# simple_for - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_"
  },
  {
    "path": "docs/benchmarks/simple_pool_details.md",
    "chars": 7322,
    "preview": "# simple_pool - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM"
  },
  {
    "path": "docs/benchmarks/small_buffer_details.md",
    "chars": 1812,
    "preview": "# small_buffer - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| B"
  },
  {
    "path": "docs/benchmarks/summing_for_details.md",
    "chars": 10552,
    "preview": "# summing_for - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM"
  },
  {
    "path": "docs/benchmarks/timed_task_details.md",
    "chars": 1075,
    "preview": "# timed_task - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n| BM_"
  },
  {
    "path": "docs/benchmarks/trivial_compute_details.md",
    "chars": 10494,
    "preview": "# trivial_compute - Detailed Results\n\n| Benchmark | Time | Unit | Iterations |\n|-----------|------|------|------------|\n"
  },
  {
    "path": "docs/building.md",
    "chars": 3018,
    "preview": "# Building Dispenso\n\nDispenso uses CMake as its build system for open-source builds. Internally at Meta, the Buck\nbuild "
  },
  {
    "path": "docs/custom.css",
    "chars": 579,
    "preview": "/**\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found"
  },
  {
    "path": "docs/design/barrier_dispatch.md",
    "chars": 8607,
    "preview": "# Barrier-based Static Dispatch\n\n## Problem\n\nFor memory-bound workloads with small per-chunk compute (e.g. repeated sten"
  },
  {
    "path": "docs/design/coroutines.md",
    "chars": 20840,
    "preview": "# Dispenso Coroutine Support Design Document\n\n## Overview\n\nThis document outlines the design for C++20 coroutine integra"
  },
  {
    "path": "docs/design/cpp20_concepts.md",
    "chars": 6606,
    "preview": "# C++20 Concepts for Dispenso\n\n## Overview\n\nThis document outlines a plan to add C++20 concept constraints to dispenso's"
  },
  {
    "path": "docs/design/fast_math_roadmap.md",
    "chars": 15617,
    "preview": "# dispenso::fast_math Roadmap\n\nThis document tracks planned features and improvements for the fast_math sublibrary.\n\n## "
  },
  {
    "path": "docs/design/parallel_algorithms.md",
    "chars": 32066,
    "preview": "# Dispenso Parallel Algorithms Design Document\n\n## Overview\n\nThis document outlines the design for a set of parallel alg"
  },
  {
    "path": "docs/design/release_checklist.md",
    "chars": 854,
    "preview": "# Release Checklist\n\nPost-release tasks and reminders for package manager updates.\n\n## vcpkg: Remove temporary patches\n\n"
  },
  {
    "path": "docs/design/roadmap.md",
    "chars": 8510,
    "preview": "# Dispenso Roadmap\n\nThis document tracks planned features and improvements for the dispenso library.\n\n## In Progress\n\n| "
  },
  {
    "path": "docs/getting_started.md",
    "chars": 11845,
    "preview": "# Getting Started {#getting_started}\n\nThis guide walks through the core features of dispenso with working examples.\nEach"
  },
  {
    "path": "docs/groups.dox",
    "chars": 7521,
    "preview": "/**\n * @defgroup group_core Core Components\n * @brief Thread pools and task management fundamentals\n *\n * The core compo"
  },
  {
    "path": "docs/header.html",
    "chars": 2632,
    "preview": "<!-- HTML header for generated pages -->\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"https://www.w3."
  },
  {
    "path": "docs/mainpage.md",
    "chars": 1136,
    "preview": "# Dispenso {#mainpage}\n\n**A high-performance C++14 library for task parallelism**\n\nDispenso provides mechanisms for thre"
  },
  {
    "path": "docs/migrating_from_openmp.md",
    "chars": 9901,
    "preview": "# Migrating from OpenMP to Dispenso\n\nThis guide helps you migrate parallel code from OpenMP to dispenso. Dispenso offers"
  },
  {
    "path": "docs/migrating_from_tbb.md",
    "chars": 12555,
    "preview": "# Migrating from Intel TBB to Dispenso\n\nThis guide helps you migrate parallel code from Intel Threading Building Blocks "
  },
  {
    "path": "docs/third-party/doxygen-awesome/doxygen-awesome-darkmode-toggle.js",
    "chars": 7462,
    "preview": "// SPDX-License-Identifier: MIT\n/**\n\nDoxygen Awesome\nhttps://github.com/jothepro/doxygen-awesome-css\n\nCopyright (c) 2021"
  },
  {
    "path": "docs/third-party/doxygen-awesome/doxygen-awesome.css",
    "chars": 76636,
    "preview": "/* SPDX-License-Identifier: MIT */\n/**\n\nDoxygen Awesome\nhttps://github.com/jothepro/doxygen-awesome-css\n\nCopyright (c) 2"
  },
  {
    "path": "examples/CMakeLists.txt",
    "chars": 768,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the MIT license found in the"
  },
  {
    "path": "examples/concurrent_vector_example.cpp",
    "chars": 5270,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/for_each_example.cpp",
    "chars": 2760,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/future_example.cpp",
    "chars": 4452,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/graph_example.cpp",
    "chars": 6452,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/latch_example.cpp",
    "chars": 4524,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/parallel_for_example.cpp",
    "chars": 2820,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/pipeline_example.cpp",
    "chars": 5215,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/resource_pool_example.cpp",
    "chars": 6081,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "examples/task_set_example.cpp",
    "chars": 4083,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "results/android_arm64.json",
    "chars": 422395,
    "preview": "{\n  \"machine_info\": {\n    \"timestamp\": \"2026-03-13T21:39:24+00:00\",\n    \"platform\": \"Android\",\n    \"platform_release\": \""
  },
  {
    "path": "results/linux_x64.json",
    "chars": 1081014,
    "preview": "{\n  \"machine_info\": {\n    \"timestamp\": \"2026-03-20T07:38:08.196941\",\n    \"platform\": \"Linux\",\n    \"platform_release\": \"6"
  },
  {
    "path": "results/macos_arm64.json",
    "chars": 545295,
    "preview": "{\n  \"machine_info\": {\n    \"timestamp\": \"2026-03-20T10:21:46.312656\",\n    \"platform\": \"Darwin\",\n    \"platform_release\": \""
  },
  {
    "path": "results/windows_x64.json",
    "chars": 765338,
    "preview": "{\n  \"machine_info\": {\n    \"timestamp\": \"2026-03-20T08:41:43.239873\",\n    \"platform\": \"Windows\",\n    \"platform_release\": "
  },
  {
    "path": "run_bench.bat",
    "chars": 879,
    "preview": "REM Copyright (c) Meta Platforms, Inc. and affiliates.\nREM\nREM This source code is licensed under the MIT license found "
  },
  {
    "path": "scripts/BENCHMARKING.md",
    "chars": 5353,
    "preview": "# Benchmark Generation Guide\n\nThis guide covers building dispenso benchmarks and generating performance\ncharts for the 1"
  },
  {
    "path": "scripts/compare_benchmarks.py",
    "chars": 8326,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "scripts/generate_charts.py",
    "chars": 79425,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "scripts/generate_plotly_benchmarks.py",
    "chars": 65925,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "scripts/run_benchmarks.py",
    "chars": 24506,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "scripts/update_benchmarks.py",
    "chars": 18394,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "scripts/update_package_managers.py",
    "chars": 78948,
    "preview": "#!/usr/bin/env python3\n# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the M"
  },
  {
    "path": "tests/CMakeLists.txt",
    "chars": 3439,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the MIT license found in the"
  },
  {
    "path": "tests/async_request_test.cpp",
    "chars": 1377,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "tests/chunked_for_test.cpp",
    "chars": 10100,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "tests/completion_event_test.cpp",
    "chars": 3324,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  },
  {
    "path": "tests/concurrent_object_arena_test.cpp",
    "chars": 7427,
    "preview": "/*\n * Copyright (c) Meta Platforms, Inc. and affiliates.\n *\n * This source code is licensed under the MIT license found "
  }
]

// ... and 68 more files (download for full content)

About this extraction

This page contains the full source code of the facebookincubator/dispenso GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 268 files (5.4 MB), approximately 1.4M tokens, and a symbol index with 2316 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo