Repository: facebookincubator/dispenso Branch: main Commit: c787a8423663 Files: 268 Total size: 5.4 MB Directory structure: gitextract_x94z546h/ ├── .clang-format ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── build.yml │ ├── codeql.yml │ └── docs.yml ├── .gitignore ├── CHANGELOG.md ├── CMakeLists.txt ├── CMakePresets.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmarks/ │ ├── CMakeLists.txt │ ├── benchmark_common.h │ ├── cascading_parallel_for_benchmark.cpp │ ├── concurrent_vector_benchmark.cpp │ ├── fast_math/ │ │ ├── CMakeLists.txt │ │ ├── avx512_benchmarks.cpp │ │ ├── avx_benchmarks.cpp │ │ ├── benchmark_helpers.h │ │ ├── benchmarks.cpp │ │ ├── erf_benchmarks.cpp │ │ ├── hwy_benchmarks.cpp │ │ ├── neon_benchmarks.cpp │ │ └── sse_benchmarks.cpp │ ├── for_each_benchmark.cpp │ ├── for_latency_benchmark.cpp │ ├── future_benchmark.cpp │ ├── graph_benchmark.cpp │ ├── graph_scene_benchmark.cpp │ ├── idle_pool_benchmark.cpp │ ├── locality_benchmark.cpp │ ├── nested_for_benchmark.cpp │ ├── nested_pool_benchmark.cpp │ ├── once_function_benchmark.cpp │ ├── pipeline_benchmark.cpp │ ├── pool_allocator_benchmark.cpp │ ├── run_benchmarks.py │ ├── rw_lock_benchmark.cpp │ ├── simple_for_benchmark.cpp │ ├── simple_pool_benchmark.cpp │ ├── small_buffer_benchmark.cpp │ ├── summing_for_benchmark.cpp │ ├── tbb_compat.h │ ├── thread_benchmark_common.h │ ├── timed_task_benchmark.cpp │ └── trivial_compute_benchmark.cpp ├── cmake/ │ └── DispensoConfig.cmake.in ├── codecov.yml ├── dispenso/ │ ├── CMakeLists.txt │ ├── async_request.h │ ├── completion_event.h │ ├── concurrent_object_arena.h │ ├── concurrent_vector.h │ ├── detail/ │ │ ├── can_invoke.h │ │ ├── completion_event_impl.h │ │ ├── concurrent_vector_impl.h │ │ ├── concurrent_vector_impl2.h │ │ ├── epoch_waiter.h │ │ ├── future_impl.h │ │ ├── future_impl2.h │ │ ├── graph_executor_impl.h │ │ ├── math.h │ │ ├── notifier_common.h │ │ ├── once_callable_impl.h │ │ ├── op_result.h │ │ ├── per_thread_info.cpp │ │ ├── per_thread_info.h │ │ ├── pipeline_impl.h │ │ ├── quanta.cpp │ │ ├── quanta.h │ │ ├── result_of.h │ │ ├── rw_lock_impl.h │ │ ├── small_buffer_allocator_impl.h │ │ ├── task_set_impl.h │ │ └── timed_task_impl.h │ ├── dispenso.h │ ├── fast_math/ │ │ ├── README.md │ │ ├── detail/ │ │ │ ├── double_promote.h │ │ │ └── fast_math_impl.h │ │ ├── fast_math.h │ │ ├── float_traits.h │ │ ├── float_traits_avx.h │ │ ├── float_traits_avx512.h │ │ ├── float_traits_hwy.h │ │ ├── float_traits_neon.h │ │ ├── float_traits_x86.h │ │ ├── simd.h │ │ └── util.h │ ├── for_each.h │ ├── future.h │ ├── graph.cpp │ ├── graph.h │ ├── graph_executor.cpp │ ├── graph_executor.h │ ├── latch.h │ ├── once_function.h │ ├── parallel_for.h │ ├── pipeline.h │ ├── platform.h │ ├── pool_allocator.cpp │ ├── pool_allocator.h │ ├── priority.cpp │ ├── priority.h │ ├── resource_pool.h │ ├── rw_lock.h │ ├── schedulable.h │ ├── small_buffer_allocator.cpp │ ├── small_buffer_allocator.h │ ├── small_vector.h │ ├── spsc_ring_buffer.h │ ├── task_set.cpp │ ├── task_set.h │ ├── third-party/ │ │ └── moodycamel/ │ │ ├── LICENSE.md │ │ ├── README.txt │ │ ├── blockingconcurrentqueue.h │ │ ├── concurrentqueue.h │ │ └── lightweightsemaphore.h │ ├── thread_id.cpp │ ├── thread_id.h │ ├── thread_pool.cpp │ ├── thread_pool.h │ ├── timed_task.cpp │ ├── timed_task.h │ ├── timing.cpp │ ├── timing.h │ ├── tsan_annotations.cpp │ ├── tsan_annotations.h │ ├── util.h │ └── utils/ 
│ └── graph_dot.h ├── docs/ │ ├── Doxyfile │ ├── benchmarks/ │ │ ├── benchmark_results.md │ │ ├── concurrent_vector_details.md │ │ ├── concurrent_vector_tcmalloc_details.md │ │ ├── for_latency_details.md │ │ ├── future_details.md │ │ ├── graph_details.md │ │ ├── graph_scene_details.md │ │ ├── idle_pool_details.md │ │ ├── index.html │ │ ├── nested_for_details.md │ │ ├── nested_pool_details.md │ │ ├── once_function_details.md │ │ ├── pipeline_details.md │ │ ├── pool_allocator_details.md │ │ ├── rw_lock_details.md │ │ ├── simple_for_details.md │ │ ├── simple_pool_details.md │ │ ├── small_buffer_details.md │ │ ├── summing_for_details.md │ │ ├── timed_task_details.md │ │ └── trivial_compute_details.md │ ├── building.md │ ├── custom.css │ ├── design/ │ │ ├── barrier_dispatch.md │ │ ├── coroutines.md │ │ ├── cpp20_concepts.md │ │ ├── fast_math_roadmap.md │ │ ├── parallel_algorithms.md │ │ ├── release_checklist.md │ │ └── roadmap.md │ ├── getting_started.md │ ├── groups.dox │ ├── header.html │ ├── mainpage.md │ ├── migrating_from_openmp.md │ ├── migrating_from_tbb.md │ └── third-party/ │ └── doxygen-awesome/ │ ├── doxygen-awesome-darkmode-toggle.js │ └── doxygen-awesome.css ├── examples/ │ ├── CMakeLists.txt │ ├── concurrent_vector_example.cpp │ ├── for_each_example.cpp │ ├── future_example.cpp │ ├── graph_example.cpp │ ├── latch_example.cpp │ ├── parallel_for_example.cpp │ ├── pipeline_example.cpp │ ├── resource_pool_example.cpp │ └── task_set_example.cpp ├── results/ │ ├── android_arm64.json │ ├── linux_x64.json │ ├── macos_arm64.json │ └── windows_x64.json ├── run_bench.bat ├── scripts/ │ ├── BENCHMARKING.md │ ├── compare_benchmarks.py │ ├── generate_charts.py │ ├── generate_plotly_benchmarks.py │ ├── run_benchmarks.py │ ├── update_benchmarks.py │ └── update_package_managers.py └── tests/ ├── CMakeLists.txt ├── async_request_test.cpp ├── chunked_for_test.cpp ├── completion_event_test.cpp ├── concurrent_object_arena_test.cpp ├── concurrent_vector_a_test.cpp ├── concurrent_vector_b_test.cpp ├── concurrent_vector_default_test.cpp ├── concurrent_vector_nocache_test.cpp ├── concurrent_vector_test_common.h ├── concurrent_vector_test_common_types.h ├── fast_math/ │ ├── CMakeLists.txt │ ├── acos_test.cpp │ ├── asin_test.cpp │ ├── atan2_test.cpp │ ├── atan_test.cpp │ ├── avx512_test.cpp │ ├── avx_test.cpp │ ├── bivariate_ulp_eval.h │ ├── cbrt_test.cpp │ ├── cos_test.cpp │ ├── erf_test.cpp │ ├── eval.cpp │ ├── eval.h │ ├── exp10_test.cpp │ ├── exp2_test.cpp │ ├── exp_test.cpp │ ├── expm1_test.cpp │ ├── frexp_test.cpp │ ├── hwy_test.cpp │ ├── hypot_test.cpp │ ├── ldexp_test.cpp │ ├── log10_test.cpp │ ├── log1p_test.cpp │ ├── log2_test.cpp │ ├── log_test.cpp │ ├── neon_test.cpp │ ├── pow_test.cpp │ ├── pow_ulp_eval.cpp │ ├── simd_test_utils.h │ ├── sin_test.cpp │ ├── sincos_test.cpp │ ├── sinpi_test.cpp │ ├── sse_test.cpp │ ├── tan_test.cpp │ ├── tanh_test.cpp │ ├── test_main.cpp │ ├── ulp_eval.cpp │ └── util_test.cpp ├── for_each_test.cpp ├── forward_shared_pool.cpp ├── future_test.cpp ├── graph_test.cpp ├── greedy_for_ranges_test.cpp ├── greedy_for_test.cpp ├── latch_test.cpp ├── once_function_test.cpp ├── pipeline_test.cpp ├── pool_allocator_test.cpp ├── priority_test.cpp ├── resource_pool_test.cpp ├── rw_lock_test.cpp ├── shared_pool_test.cpp ├── small_buffer_allocator_test.cpp ├── small_vector_test.cpp ├── spsc_ring_buffer_test.cpp ├── task_set_test.cpp ├── test_tid.h ├── thread_id_test.cpp ├── thread_pool_test.cpp ├── timed_task_test.cpp ├── timing_test.cpp └── util_test.cpp 
================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ --- AccessModifierOffset: -1 AlignAfterOpenBracket: AlwaysBreak AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlinesLeft: true AlignOperands: false AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: false AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false BeforeCatch: false BeforeElse: false IndentBraces: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false ColumnLimit: 100 CommentPragmas: '^ IWYU pragma:' ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] IncludeCategories: - Regex: '^<.*\.h(pp)?>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left RawStringFormats: - Language: TextProto Delimiters: - pb ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 4 UseTab: Never ... ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Bug Report description: File a bug report title: "[Bug]: " labels: ["bug", "triage"] assignees: - graphicsMan body: - type: markdown attributes: value: | Thanks for taking the time to fill out this bug report! - type: input id: contact attributes: label: Contact Details description: How can we get in touch with you if we need more info? placeholder: ex. email@example.com validations: required: false - type: textarea id: what-happened attributes: label: What happened? description: Also tell us, what did you expect to happen? placeholder: Tell us what you see! value: "A bug happened!" validations: required: true - type: dropdown id: version attributes: label: Version description: What version of our software are you running? 
options: - 1.0 (Default) - latest (Edge) validations: required: true - type: checkboxes id: terms attributes: label: Code of Conduct description: By submitting this issue, you agree to follow our [Code of Conduct](https://example.com) options: - label: I agree to follow this project's Code of Conduct required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: Feature Request description: File a feature request title: "[Feature Request]: " labels: ["feature", "request"] assignees: - graphicsMan body: - type: markdown attributes: value: | Thanks for taking the time to fill out this feature request! - type: input id: contact attributes: label: Contact Details description: How can we get in touch with you if we need more info? placeholder: ex. email@example.com validations: required: false - type: textarea id: whats-wanted attributes: label: What is the desired feature? description: Give some details value: "Details here" validations: required: true - type: checkboxes id: terms attributes: label: Code of Conduct description: By submitting this issue, you agree to follow our [Code of Conduct](https://example.com) options: - label: I agree to follow this project's Code of Conduct required: true ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ # PR Details ## Description ## Related Issue ## Motivation and Context ## Test Plan ## Types of changes - [ ] Docs change - [ ] Refactoring - [ ] Dependency upgrade - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) ## Checklist - [ ] My code follows the code style of this project. - [ ] I have run clang-format. - [ ] My change requires a change to the documentation. - [ ] I have updated the documentation accordingly. - [ ] I have read the **CONTRIBUTING** document. - [ ] I have added tests to cover my changes. - [ ] All new and existing tests passed, including in ASAN and TSAN modes (if available on your platform). 
================================================ FILE: .github/workflows/build.yml ================================================ name: Build and test on: push: branches: [main] pull_request: branches: [main] permissions: contents: read env: CTEST_OUTPUT_ON_FAILURE: 1 jobs: ############################################################################# # Core platform builds - strategic sampling of OS/compiler/standard/library ############################################################################# build-matrix: name: ${{ matrix.name }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: # Linux - GCC baseline - name: Linux GCC C++14 os: ubuntu-latest cmake_args: -DCMAKE_CXX_STANDARD=14 # Linux - Clang with C++20 concepts - name: Linux Clang C++20 os: ubuntu-latest cc: clang cxx: clang++ cmake_args: -DCMAKE_CXX_STANDARD=20 # macOS - C++20 with static library - name: macOS C++20 static os: macos-latest cmake_args: -DCMAKE_CXX_STANDARD=20 -DDISPENSO_SHARED_LIB=OFF # Windows - MSVC baseline - name: Windows MSVC C++14 os: windows-latest cmake_args: -DCMAKE_CXX_STANDARD=14 # Windows - MSVC C++20 with static library - name: Windows MSVC C++20 static os: windows-latest cmake_args: -DCMAKE_CXX_STANDARD=20 -DDISPENSO_SHARED_LIB=OFF steps: - uses: actions/checkout@v4 - name: Configure run: | mkdir build && cd build cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release ${{ matrix.cmake_args }} env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - name: Build working-directory: ./build run: cmake --build . --parallel 4 --config Release - name: Test working-directory: ./build run: ctest -LE flaky --build-config Release ############################################################################# # Architecture-specific builds ############################################################################# build-linux-x86: name: Linux x86 (32-bit) runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install 32-bit support run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib - name: Configure run: | mkdir build && cd build cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_FLAGS="-m32" -DCMAKE_C_FLAGS="-m32" - name: Build working-directory: ./build run: cmake --build . --parallel 4 - name: Test working-directory: ./build run: ctest -LE flaky build-macos-gcc: name: macOS GCC C++14 (ARM64) runs-on: macos-latest steps: - uses: actions/checkout@v4 - name: Set up GCC run: | brew list gcc || brew install gcc GCC_PREFIX=$(brew --prefix gcc) GCC_VER=$(ls "$GCC_PREFIX/bin"/gcc-* | grep -oE '[0-9]+$' | sort -n | tail -1) echo "CC=$GCC_PREFIX/bin/gcc-$GCC_VER" >> $GITHUB_ENV echo "CXX=$GCC_PREFIX/bin/g++-$GCC_VER" >> $GITHUB_ENV - name: Configure run: | mkdir build && cd build cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_STANDARD=14 - name: Build working-directory: ./build run: cmake --build . --parallel 4 - name: Test working-directory: ./build run: ctest -LE flaky build-windows-x86: name: Windows x86 (32-bit) runs-on: windows-latest steps: - uses: actions/checkout@v4 - name: Configure run: | mkdir build && cd build cmake .. -A Win32 -DDISPENSO_BUILD_TESTS=ON - name: Build working-directory: ./build run: cmake --build . 
--parallel 4 --config Release - name: Test working-directory: ./build run: ctest -LE flaky --build-config Release ############################################################################# # Sanitizer builds - critical for concurrency library ############################################################################# sanitizers: name: ${{ matrix.name }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: # TSan requires clang - GCC 13's TSan has spurious warnings about atomic_thread_fence - name: Thread Sanitizer cmake_args: -DTHREAD_SANITIZER=ON cc: clang cxx: clang++ - name: Address Sanitizer cmake_args: -DADDRESS_SANITIZER=ON steps: - uses: actions/checkout@v4 - name: Configure run: | mkdir build && cd build cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug ${{ matrix.cmake_args }} env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - name: Build working-directory: ./build run: cmake --build . --parallel 4 - name: Test working-directory: ./build run: ctest -LE flaky ############################################################################# # Documentation build ############################################################################# docs: name: Documentation runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install Doxygen run: sudo apt-get update && sudo apt-get install -y doxygen graphviz - name: Build documentation working-directory: ./docs run: doxygen Doxyfile - name: Check for warnings working-directory: ./docs run: | if [ -s doxygen_warnings.log ]; then echo "Doxygen warnings found:" cat doxygen_warnings.log exit 1 fi echo "No Doxygen warnings." ############################################################################# # Code coverage ############################################################################# coverage: name: Code Coverage runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install lcov run: sudo apt-get update && sudo apt-get install -y lcov - name: Configure with coverage run: | mkdir build && cd build cmake .. -DDISPENSO_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_FLAGS="--coverage -fprofile-arcs -ftest-coverage -fprofile-update=atomic" \ -DCMAKE_C_FLAGS="--coverage -fprofile-arcs -ftest-coverage -fprofile-update=atomic" \ -DCMAKE_EXE_LINKER_FLAGS="--coverage" - name: Build working-directory: ./build run: cmake --build . --parallel 4 - name: Test (stable tests) working-directory: ./build run: ctest -LE flaky - name: Test (TimedTask, with retries) working-directory: ./build run: ctest -L flaky -R "^TimedTaskTest\." 
--repeat until-pass:5 - name: Generate coverage report run: | lcov --capture --directory build --output-file coverage.info --ignore-errors mismatch,gcov,negative lcov --remove coverage.info '/usr/*' '*/build/_deps/*' '*/tests/*' '*/third-party/*' '*/benchmarks/*' '*/examples/*' '*/bits/*' '*/ext/*' --output-file coverage.info --ignore-errors unused lcov --list coverage.info - name: Upload to Codecov uses: codecov/codecov-action@v5 with: files: coverage.info token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: false verbose: true ================================================ FILE: .github/workflows/codeql.yml ================================================ name: CodeQL on: push: branches: [main] pull_request: branches: [main] schedule: - cron: '17 9 * * 1' jobs: analyze: name: Analyze (C/C++) runs-on: ubuntu-latest permissions: security-events: write contents: read steps: - uses: actions/checkout@v4 - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: c-cpp - name: Build run: | cmake -S . -B build \ -DDISPENSO_BUILD_TESTS=OFF \ -DDISPENSO_BUILD_BENCHMARKS=OFF \ -DCMAKE_BUILD_TYPE=Release cmake --build build --parallel 4 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 ================================================ FILE: .github/workflows/docs.yml ================================================ name: Docs on: push: branches: [ main ] permissions: contents: write jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set version from CHANGELOG run: | VERSION=$(grep -m1 '^[0-9]' CHANGELOG.md | sed 's/ .*//') sed -i "s/^PROJECT_NUMBER.*/PROJECT_NUMBER = $VERSION/" docs/Doxyfile - name: Doxygen Action uses: mattnotmitt/doxygen-action@v1 with: working-directory: "docs/" doxyfile-path: "./Doxyfile" - name: Fix Doxygen output permissions and copy benchmark dashboard run: | sudo chown -R $(id -u):$(id -g) docs/doxygen cp -r docs/benchmarks docs/doxygen/html/benchmarks - name: Deploy uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: ./docs/doxygen/html ================================================ FILE: .gitignore ================================================ # Compiled source # ################### *.com *.class *.dll *.exe *.o *.so *.a bin lib # Packages # ############ # it's better to unpack these files and commit the raw source # git has its own built in compression methods *.7z *.dmg *.gz *.iso *.jar *.rar *.tar *.zip /.project # generated cmake files # ######################### *CMakeCache.txt *.log *.make *.cmake CMakeFiles Makefile *Dir /build/* docs/doxygen/ # Clang # ######### .cache/ ================================================ FILE: CHANGELOG.md ================================================ 1.5.1 (March 28, 2026) ### Bug fixes * Fixed `__ulock_wait`/`__ulock_wake` usage on macOS versions prior to 10.12 and on PowerPC where these APIs are unavailable. The ulock path is now guarded behind a runtime version check with `pthread_cond` fallback. * Fixed ARM64 Windows build failure: `notifier_common.h` incorrectly defined `_ARM_` (32-bit ARM) instead of `_ARM64_` on ARM64 Windows, causing `winnt.h` to reference missing 32-bit ARM intrinsics. * Fixed `platform.h` version macros not being updated for 1.5.0 release (were stuck at 1.4.1) * Removed vestigial `CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS` from CMakeLists.txt. All public APIs now use proper `DISPENSO_DLL_ACCESS` annotations; the blanket export is no longer needed and is prohibited by vcpkg's maintainer guide. 
### Build system * Added `DISPENSO_USE_SYSTEM_CONCURRENTQUEUE` CMake option to use system-installed `moodycamel::concurrentqueue` instead of bundled copy (default OFF), for vcpkg compatibility * Export C++ standard requirement via `target_compile_features` so downstream consumers compile with at least the same standard dispenso was built with * Respect `BUILD_SHARED_LIBS` for `DISPENSO_SHARED_LIB` default, allowing vcpkg to control static/shared linkage ### Infrastructure * Added package manager release automation script (`scripts/update_package_managers.py`) with post-write checksum verification, platform-aware testing, and PR body templates following each repo's CONTRIBUTING.md * Added CodeQL security analysis workflow scoped to main branch * Added package manager badges (vcpkg, Conan, Homebrew, MacPorts) to README * Added release checklist documentation 1.5.0 (March 22, 2026) ### New features * Added `SmallVector` container with configurable inline storage, reducing heap allocations for small collections * Added `SPSCRingBuffer` lock-free single-producer single-consumer ring buffer with power-of-two optimization * Added `scheduleBulk(count, generator)` API to ThreadPool, TaskSet, and ConcurrentTaskSet for efficient bulk task submission with reduced atomic contention * Added random-access iterator specialization for `for_each_n`, with iterator category dispatch for optimal chunk boundary computation * Added Mac futex-based wakeup using `os_sync_wait_on_address` (macOS 14.4+) with `__ulock_wait` fallback * Added C++20 concept constraints for better error messages when template requirements aren't met * Added experimental `fast_math` sublibrary with SIMD-accelerated math functions including `log2`, `exp2`, `exp`, `exp10`, `cbrt`, `sin`, `cos`, `sincos`, `asin`, and `atan2` with configurable accuracy/performance trade-offs and multiple SIMD backends (SSE4.1, AVX2, AVX512, NEON, Highway). 
**API unstable** — gated by `DISPENSO_BUILD_FAST_MATH` CMake option * Added benchmark runner and chart generation scripts with multi-platform support * Added interactive Plotly.js benchmark dashboard generator ### Performance improvements * ThreadPool atomics simplification: replaced 3 per-task tracking atomics with `numSleeping_` + batched `workRemaining_` decrements, reducing per-task atomic operations from 5-6 to ~1 (+24% geometric mean across 568 benchmark tests) * ThreadPool wakeup heuristic: reduced futex calls from ~1M to ~11K by only waking when capacity cannot cover queued work; `mostly_idle` benchmark 2.6x faster * Cache-line alignment for `poolLoadFactor_` and `numThreads_` to reduce false sharing (L1 cache miss rate 16.33% → 6.92% on `schedule()` hot path) * Graph executor optimizations: `SmallVector` for node dependents, pre-reserve capacity, inline continuation (build_big_tree 1.95x faster, build_dep_chain 2.14x faster) * Serial pipeline SPSC optimization: dedicated executor with ring buffers for fully-serial pipelines (~33% faster) * Inline continuation for serial pipeline stages (scheduling overhead reduced ~3x) * Bulk wakeup with threshold-based `wakeN()`/`wakeAll()` selection for efficient bulk scheduling * `for_each_n` converted to `scheduleBulk`: 2.1x faster at 32 threads, 1.6x at 64 threads for 100M elements * `parallel_for` kAuto bulk scheduling: trivial_compute 52ms → 19ms at 192 threads (matching TBB) * ConcurrentVector: non-atomic buffer pointer cache for read-hot paths (disabled on ARM), inline asm `bsr` for `detail::log2` on x86, and platform-adaptive `bucketAndSubIndexForIndex` fast path * `OnceFunction` devirtualized: replaced vtable-based dispatch with direct function pointer, eliminating indirect call overhead * `TaskSet`/`ConcurrentTaskSet` noWait path: replaced `shared_ptr` with pool-allocated single-atomic chunk index, reducing allocation overhead ### Infrastructure * Benchmark runner (`run_benchmarks.py`) with JSON output and machine info collection * Chart generator (`generate_charts.py`) with specialized visualizations per benchmark suite * Multi-platform benchmark composition (`update_benchmarks.py --compose`) for unified documentation * Prefer system-installed GoogleTest, Taskflow, and TBB in CMake with FetchContent fallback * Added oneTBB compatibility via `tbb_compat.h` wrapper for `task_scheduler_init` * Added BUCK targets for `idle_pool_benchmark`, `nested_pool_benchmark`, `for_each_benchmark`, and `locality_benchmark` ### Documentation * Added examples directory with compilable example programs for each feature * Added Getting Started guide (`docs/getting_started.md`) with inline code snippets from examples * Added OpenMP migration guide (`docs/migrating_from_openmp.md`) * Improved README clarity, discoverability, and feature descriptions ### CI and build improvements * Comprehensive CI matrix: 11 jobs covering 3 architectures (x64, x86, ARM64), 3 OSes, 3 compilers (GCC, Clang, MSVC), C++14/20, TSan/ASan, code coverage, and Doxygen builds * Added codecov.yml for enforcing 92% code coverage threshold ### Bug fixes * Fixed ABI mismatch between exception and no-exception builds: `TaskSetBase` and `FutureImplResultMember` had conditionally compiled members that shifted struct layout depending on `-fno-exceptions`, causing crashes when translation units disagreed. Exception-related data members are now always present in the layout (with zero runtime cost when exceptions are disabled). 
**Note:** this changes the ABI for builds that previously used `-fno-exceptions`; recompile all code against 1.5 headers if mixing exception modes. * Fixed `ConcurrentTaskSet` parent stack overflow when tasks recursively schedule to the same task set: self-recursive inlining via `tryExecuteNext()` repeatedly pushed the same `TaskSetBase*` onto the thread-local parent stack (depth limit 64), causing an abort under heavy inlining. Fix skips redundant push/pop when the TaskSet is already the current parent. * Fixed pipeline `kLimited` scheduler `wait()` losing late-arriving items: the LIMITED path only drained the local queue without waiting for in-flight items, so items enqueued by a previous stage's CTS task after the drain could be permanently orphaned. Fix replaces the drain-only loop with an `outstanding_`-based spin that ensures all items complete, with `tryExecuteNext()` to keep the calling thread productive. * Fixed `parallel_for` with `kAuto` chunking incorrectly falling back to static chunking when `maxThreads` was left at default * Fixed `NoOpIter` missing iterator trait typedefs for C++20 compliance * Fixed `NoOpIter::operator*()` / `operator[]` static local data race * Fixed SmallBufferAllocator unsigned underflow where `allocSmallBuffer<1/2/3>()` returned nullptr instead of a 4-byte block * Fixed `cpuRelax()` being a no-op on MSVC (missing `_mm_pause()` / `__yield()` intrinsics) * Fixed x86 Windows build issues * Fixed Doxygen documentation warnings * Fixed pipeline exception safety: exceptions thrown in pipeline stage functors are now caught and propagated to the caller via `ConcurrentTaskSet`. Added RAII guards for stage resource cleanup, `OnceFunction::cleanupNotRun()` for proper deallocation of unexecuted tasks, and a deadlock fix in the `kLimited` scheduler's resource spin loop when exceptions leave no threads to release resources. * Fixed MSVC lambda capture for constexpr variable * Fixed `idle_pool_benchmark` fairness (loop bound, static scheduling, and pool placement) * Fixed `nested_for_benchmark` incorrect loop bound, static scheduling, and pool placement ### Test improvements * Added comprehensive tests for thread_pool spin-poll with sleep mode * Added comprehensive task_set edge case tests * Added comprehensive tests for concurrent_object_arena * Added edge case tests for pool_allocator * Added timing tests for getTime() function * Added tests for Graph/Subgraph accessors and BiPropNode edge cases * Added `SmallVector` test suite (43 tests) * Added `SPSCRingBuffer` test suite (47 tests) * Improved overall test coverage from ~89% to 96.3% (dispenso source only, excluding stdlib and third-party) 1.4.1 (January 5, 2026) ### Bug fixes and build improvements * Fixed clock frequency calculation for mac-arm platforms * Addressed potential race condition at TimedTaskScheduler construction * Adjusted build platforms for better compatibility 1.4 (January 2, 2025) ### Efficiency improvements, bug and warning fixes * Added some benchmarks and comparison with TaskFlow (thanks andre-nguyen!) * Fixed compilation when compiling with DISPENSO_DEBUG (thanks EscapeZero!) * Improved efficiency on Linux for infrequent thread pool usage. Reduces polling overhead by 10x by switching to event-based wakeup instead of spin polling. * Fix C++20 compilation issues (thanks aavbsouza!) * Fix several build warnings (thanks SeaOtocinclus!) * Add conda package badge, disable gtest install (thanks JeongSeok Lee!) 
* Solved rare post-main shutdown issues with NewThreadInvoker * Fixed test issues for 32-bit builds * Fixed broken test logic for test thread IDs * Fixed various build warnings 1.3 (April 25, 2024) ### Bug fixes, portability enhancements, and small functionality enhancements * Fixed several generic warnings (thanks michel-slm!) * cpuRelax added for PowerPC and ARM (thanks barracuda156!) * Added missing header (thanks ryandesign!) * Try to detect and add libatomic when required (thanks for discussions barracuda156!) * Enable small buffers from small buffer allocators to go down to 4 bytes (thanks for discussion David Caruso!). This is handy for 32-bit builds where pointers are typically 4 bytes * Ensure that NOMINMAX is propagated for CMake Windows builds (thanks SeaOtocinclus!) * Fix some cases using std::make_shared for types requiring large alignment, which is a bug prior to C++17 (thanks for help finding these SeaOtocinclus!) * Set up CI on GitHub Actions, including builds for Mac and Windows in addition to Linux (thanks SeaOtocinclus!) * Add an environment variable `DISPENSO_MAX_THREADS_PER_POOL` to limit max number of threads available to any thread pool. In the spirit of `OMP_NUM_THREADS`. (thanks Yong-Chull Jang!) * Slight change of behavior w.r.t. use of `maxThreads` option in `ForEachOptions` and `ParForOptions` to limit concurrency the same way in both blocking and non-blocking `for_each` and `parallel_for` (thanks Arnie Yuan!) * Various fixes to enable CMake builds on various 32-bit platforms (thanks for discussions barracuda156!) * Updates to README Known Issues: * Large subset of dispenso tests are known to fail on 32-bit PPC Mac. If you have access to such a machine and are willing to help debug, it would be appreciated! * NewThreadInvoker can have a program shutdown race on Windows platforms if the threads launched by it are not finished running by end of main() 1.2 (December 27, 2023) ### Bug fixes and functionality enhancements * Several small bug fixes, especially around 32-bit builds and at-exit shutdown corner cases, and TSAN finding benign races and/or causing timeout due to pathological lock-free behaviors in newer versions of TSAN * Improve accuracy of `dispenso::getTime` * Add C++20-like `Latch` functionality * Add mechanism for portable thread priorities * Add a timed task/periodically scheduled task feature. Average and standard deviation of the accuracy of `dispenso::TimedTaskScheduler` are both much better than `folly::FunctionScheduler` (from 2x to 10x+ depending on settings and platform) * Enhancements to `parallel_for` * Add an option to automatically reduce the number of threads working on a range if the work is too cheap to justify parallelization. This can result in 3000x+ speedups for very lightweight loops * Reuse per-thread state containers across parallel for calls (these must block in-between, or be thread-safe types) * `parallel_for` functors may now be called with an input range directly instead of requiring a ChunkedRange. This is as simple as providing a functor/lambda that takes the additional argument, just as was previously done with `ChunkedRange`. `ChunkedRange`s still work, and this is fully backward compatible * `ThreadPool`s have a new option for full spin polling.
This is generally best avoided, and I'd argue never to use this for the default Global thread pool, but can be useful for a subset of threads in systems that require real-time responsivity (especially, can be combined with the thread priority feature also found in this release) * Task graph execution (thanks Roman Fedotov!). Building and running dispenso task graphs is typically 25% faster than the (already excellent) `TaskFlow` library in our benchmarks. Additionally, we have a partial update feature that can enable much faster (e.g. 50x faster) execution in cases where only a small percentage of task inputs are updated (think of per-frame partial scene updates in a game) 1.1 (October 1, 2022) ### Performance and functionality enhancements * CMake changes to allow install of targets and CMake dispenso target exports (thanks jeffamstutz!) * Addition of typical container type definitions for ConcurrentVector (thanks Michael Jung!) * Large performance improvements for Futures and CompletionEvents on MacOs. Resulted in order-of-magnitude speedups for those use cases on MacOs. * Addition of new benchmark for performance with infrequent use of `parallel_for`, `for_latency_benchmark` * Fixes to ensure `parallel_for` works with thread pools with zero threads (thanks kevinbchen!). Further work has been done to ensure that thread pools with zero threads simply always run code inline. * By default, the global thread pool uses one fewer thread than the machine has hardware threads. This behavior was introduced because dispenso very often runs on the calling thread as well as pool threads, and so one fewer thread in the pool can lead to better performance. * Update googletest version to 1.12.1 (thanks porumbes!) * Add a utility in dispenso to get a thread ID, `threadId`. These 64-bit IDs are unique per thread, and will not be recycled. These values grow from zero, ensuring the caller can assume they are small if the number of threads is also small (e.g. you won't have an ID of `0xdeadbeef` if you only run hundreds or thousands of threads in the lifetime of the process). * Add a utility, `getTime`, to get time quickly. This provides the double-precision time in seconds since the first call to `getTime` after process start. * Use a new scheduling mechanism in the thread pool on Windows. This resulted in up to a 13x improvement in latency between putting items in the pool and having those items run. This scheduling is optional, but turned off for Linux and MacOs since scheduling was already fast on those platforms. * Optimizations to enable faster scheduling in thread pools. This resulted in a range of 5% to 45% speedup across multiple benchmarks including `future_benchmark` and `pipeline_benchmark`. * Fixed a performance bug in work stealing logic; now dispenso outperforms TBB in the `pipeline_benchmark` * Added a task set cancellation feature, with a relatively simple mechanism for submitted work to check if its owning task set has been cancelled. When creating a task set, you can optionally opt into parent cancellation propagation as well. While this propagation is fairly efficient, it did create a noticeable impact on performance in some cases, and thus it was decided to allow this behavior, but not penalize performance for those who don't need the behavior. 1.0 (November 24, 2021) ### dispenso initial release ================================================ FILE: CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates.
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.12) # Use /Z7 (embedded debug info) instead of /Zi (PDB server) on MSVC+Ninja. # /Zi requires mspdbsrv.exe which can fail with C1902 "Program database manager # mismatch" when Ninja launches cl.exe as a subprocess. if(POLICY CMP0141) cmake_policy(SET CMP0141 NEW) set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:Embedded>") endif() # CMake 4.x enables C++20 module scanning by default with Ninja, which breaks # try_compile() checks. Dispenso doesn't use C++20 modules. if(POLICY CMP0155) cmake_policy(SET CMP0155 OLD) endif() set(CMAKE_CXX_SCAN_FOR_MODULES OFF) project( Dispenso VERSION 1.5.1 DESCRIPTION "Dispenso is a library for working with sets of parallel tasks" LANGUAGES CXX) if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(DISPENSO_STANDALONE TRUE) else() set(DISPENSO_STANDALONE FALSE) endif() if (DISPENSO_STANDALONE) include(GNUInstallDirs) endif() list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules) # Main project setup if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) # Default to BUILD_SHARED_LIBS when set (e.g. by vcpkg), otherwise ON. if(DEFINED BUILD_SHARED_LIBS) option(DISPENSO_SHARED_LIB "Build Dispenso shared library" ${BUILD_SHARED_LIBS}) else() option(DISPENSO_SHARED_LIB "Build Dispenso shared library" ON) endif() endif() option(ADDRESS_SANITIZER "Use Address Sanitizer, incompatible with THREAD_SANITIZER" OFF) option(THREAD_SANITIZER "Use Thread Sanitizer, incompatible with ADDRESS_SANITIZER" OFF) if (ADDRESS_SANITIZER) add_compile_options(-fsanitize=address -fsanitize=undefined) add_link_options(-fsanitize=address -fsanitize=undefined) elseif (THREAD_SANITIZER) add_compile_options(-fsanitize=thread) add_link_options(-fsanitize=thread) endif() set(CMAKE_CXX_STANDARD 14 CACHE STRING "the C++ standard to use for this project") set(DISPENSO_USE_SYSTEM_CONCURRENTQUEUE OFF CACHE BOOL "Use system-installed moodycamel::concurrentqueue instead of bundled copy") ########################################################### # Targets add_subdirectory(dispenso) set(DISPENSO_BUILD_TESTS OFF CACHE BOOL "Should tests be built?") set(DISPENSO_BUILD_BENCHMARKS OFF CACHE BOOL "Should benchmarks be built?") set(DISPENSO_BUILD_EXAMPLES OFF CACHE BOOL "Should examples be built?") set(DISPENSO_BUILD_FAST_MATH OFF CACHE BOOL "Build experimental fast_math sublibrary (API unstable)") set(DISPENSO_FAST_MATH_SIMD "none" CACHE STRING "SIMD ISA for fast_math targets: none, native, sse4.1, avx2, avx512, neon") set(DISPENSO_FAST_MATH_HIGHWAY OFF CACHE BOOL "Enable Highway SIMD backend for fast_math (fetches Highway if not found)") if(DISPENSO_BUILD_FAST_MATH) # Set compiler flags for the chosen SIMD backend. # Each level is cumulative: avx2 implies sse4.1, avx512 implies avx2+sse4.1. # "native" auto-detects all ISA extensions supported by the build machine. set(DISPENSO_FAST_MATH_SIMD_FLAGS "") if(DISPENSO_FAST_MATH_SIMD STREQUAL "native") if(MSVC) # MSVC doesn't have -march=native; AVX2 is the best portable flag. set(DISPENSO_FAST_MATH_SIMD_FLAGS /arch:AVX2) else() set(DISPENSO_FAST_MATH_SIMD_FLAGS -march=native) endif() elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "sse4.1") if(NOT MSVC) # MSVC enables SSE4.1 by default on x64. 
set(DISPENSO_FAST_MATH_SIMD_FLAGS -msse4.1) endif() elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "avx2") if(MSVC) set(DISPENSO_FAST_MATH_SIMD_FLAGS /arch:AVX2) else() set(DISPENSO_FAST_MATH_SIMD_FLAGS -msse4.1 -mavx2 -mfma) endif() elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "avx512") if(MSVC) set(DISPENSO_FAST_MATH_SIMD_FLAGS /arch:AVX512) else() set(DISPENSO_FAST_MATH_SIMD_FLAGS -msse4.1 -mavx2 -mfma -mavx512f -mavx512bw -mavx512dq) endif() elseif(DISPENSO_FAST_MATH_SIMD STREQUAL "neon") # NEON is implicit on aarch64; no extra flags needed. elseif(NOT DISPENSO_FAST_MATH_SIMD STREQUAL "none") message(FATAL_ERROR "Unknown DISPENSO_FAST_MATH_SIMD value: ${DISPENSO_FAST_MATH_SIMD}. " "Valid options: none, native, sse4.1, avx2, avx512, neon") endif() # Find or fetch Highway if requested. if(DISPENSO_FAST_MATH_HIGHWAY) find_package(hwy QUIET) if(NOT hwy_FOUND) include(FetchContent) FetchContent_Declare( highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 1.2.0 ) set(HWY_ENABLE_TESTS OFF CACHE BOOL "" FORCE) set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) set(HWY_ENABLE_CONTRIB OFF CACHE BOOL "" FORCE) FetchContent_MakeAvailable(highway) endif() endif() endif() if(DISPENSO_BUILD_TESTS) enable_testing() add_subdirectory(tests) endif() if(DISPENSO_BUILD_BENCHMARKS) # Sadly any given release of folly seems to have some problem or another. Leave disabled by default. set(BENCHMARK_WITHOUT_FOLLY ON CACHE BOOL "Should folly benchmarks be disabled?") add_subdirectory(benchmarks) endif() if(DISPENSO_BUILD_EXAMPLES) add_subdirectory(examples) endif() ================================================ FILE: CMakePresets.json ================================================ { "version": 6, "cmakeMinimumRequired": { "major": 3, "minor": 21, "patch": 0 }, "configurePresets": [ { "name": "default", "displayName": "Default (system packages)", "description": "Basic build without vcpkg. Uses system-installed dependencies.", "binaryDir": "${sourceDir}/build/${presetName}", "cacheVariables": { "CMAKE_CXX_STANDARD": "17" } }, { "name": "vcpkg", "displayName": "vcpkg", "description": "Use vcpkg for dependencies (TBB, GoogleTest, benchmark, etc.)", "binaryDir": "${sourceDir}/build/${presetName}", "toolchainFile": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake", "cacheVariables": { "CMAKE_CXX_STANDARD": "17" } }, { "name": "dev", "displayName": "Development (tests + benchmarks)", "description": "Build everything for development: tests, benchmarks, and examples.", "inherits": "vcpkg", "cacheVariables": { "DISPENSO_BUILD_TESTS": "ON", "DISPENSO_BUILD_BENCHMARKS": "ON", "DISPENSO_BUILD_EXAMPLES": "ON", "CMAKE_CXX_STANDARD": "20" } }, { "name": "win-dev", "displayName": "Windows Development (Ninja)", "description": "Windows dev build using Ninja for fast single-config builds. 
Run from a VS Developer Command Prompt.", "inherits": "dev", "binaryDir": "$env{TEMP}/dispenso-build/${presetName}", "generator": "Ninja", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release", "VCPKG_APPLOCAL_DEPS": "OFF" }, "condition": { "type": "equals", "lhs": "${hostSystemName}", "rhs": "Windows" } }, { "name": "asan", "displayName": "Address Sanitizer", "inherits": "dev", "cacheVariables": { "ADDRESS_SANITIZER": "ON" } }, { "name": "tsan", "displayName": "Thread Sanitizer", "inherits": "dev", "cacheVariables": { "THREAD_SANITIZER": "ON" } } ], "buildPresets": [ { "name": "default", "configurePreset": "default", "configuration": "Release" }, { "name": "dev", "configurePreset": "dev", "configuration": "Release" }, { "name": "win-dev", "configurePreset": "win-dev" } ], "testPresets": [ { "name": "dev", "configurePreset": "dev", "configuration": "Release", "output": { "outputOnFailure": true } }, { "name": "win-dev", "configurePreset": "win-dev", "output": { "outputOnFailure": true } } ] } ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at . All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to dispenso We want to make contributing to this project as easy and transparent as possible. There is a design ethos behind the library, so it is recommended to reach out via a GitHub issue on the project to discuss non-trivial changes you may wish to make. These changes include, for example, wanting to change existing API, wanting to furnish a new utility, or wanting to change underlying behavior substantially. Let's avoid situations where you put in a lot of hard work, only to have to change it substantially or get your pull request rejected. ## Our Development Process This library has another home inside Facebook repos. From there it is subjected to regular continuous integration testing on many platforms, and used by many projects. ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `master`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Utilize clang-format. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Coding Style * 2 spaces for indentation rather than tabs * 100 character line length * Member variables have trailing underscore_ * BigCamelCase for classes and structs, and smallCamelCase for functions and variables (exception is if you are trying to match a substantial part of a standard library interface). * [1TBS braces](https://en.wikipedia.org/wiki/Indentation_style#Variant:_1TBS_(OTBS)) * Most of all, try to be consistent with the surrounding code. We have automated tools that will enforce clang-format style for some files (e.g. the C++ core) once we import your pull request into our internal code reviewing tools. 
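To make the style bullets above concrete, here is a tiny, purely illustrative snippet (the class and names are made up) showing the expected shape of dispenso-style code:

```cpp
// Illustrative only: BigCamelCase type, smallCamelCase functions/variables,
// trailing underscore_ on members, 2-space indent, attached (1TBS) braces.
class WorkTracker {
 public:
  explicit WorkTracker(size_t initialCount) : remainingCount_(initialCount) {}

  void markDone() {
    if (remainingCount_ > 0) {
      --remainingCount_;
    }
  }

  size_t remainingCount() const {
    return remainingCount_;
  }

 private:
  size_t remainingCount_;  // Member variables carry a trailing underscore.
};
```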
## License By contributing to dispenso, you agree that your contributions will be licensed under the LICENSE.md file in the root directory of this source tree. ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) Facebook, Inc. and its affiliates. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ [![Build and test](https://github.com/facebookincubator/dispenso/actions/workflows/build.yml/badge.svg)](https://github.com/facebookincubator/dispenso/actions/workflows/build.yml) [![Documentation](https://img.shields.io/badge/docs-online-blue)](https://facebookincubator.github.io/dispenso) [![codecov](https://codecov.io/gh/facebookincubator/dispenso/branch/main/graph/badge.svg)](https://codecov.io/gh/facebookincubator/dispenso) [![Conan Center](https://img.shields.io/conan/v/dispenso)](https://conan.io/center/recipes/dispenso) [![vcpkg](https://img.shields.io/vcpkg/v/dispenso)](https://vcpkg.io/en/package/dispenso) [![Homebrew](https://img.shields.io/homebrew/v/dispenso)](https://formulae.brew.sh/formula/dispenso) [![MacPorts](https://img.shields.io/badge/macports-dispenso-blue)](https://ports.macports.org/port/dispenso/) [![Anaconda-Server Badge](https://anaconda.org/conda-forge/dispenso/badges/version.svg)](https://anaconda.org/conda-forge/dispenso) # Dispenso **A high-performance C++ thread pool and parallel algorithms library** Dispenso is a modern **C++ parallel computing library** that provides work-stealing thread pools, parallel for loops, futures, task graphs, and concurrent containers. It serves as a powerful **alternative to OpenMP and Intel TBB**, offering better nested parallelism, sanitizer-clean code, and explicit thread pool control. Dispenso is used in hundreds of projects at Meta (formerly Facebook) and has been heavily tested and iterated on in production. 
**Key advantages over OpenMP and TBB:** - **No thread explosion** with nested parallel loops - dispenso's work-stealing prevents deadlocks and oversubscription - **Clean with ASAN/TSAN** - fully sanitizer-compatible, unlike many TBB versions - **Thread-safe shared futures** - `std::experimental::shared_future`-like API that TBB lacks, safe for multiple concurrent waiters, with much better performance than `std::future` - **Portable** - C++14 compatible with no compiler-specific pragmas or extensions; C++20 builds gain concept constraints for clearer error messages ## Table of Contents - [Choose Dispenso If...](#choosedispenso) - [Features](#features) - [Quick Start](#quickstart) - [Comparison vs Other Libraries](#comparison) - [Migration Guides](#migrationguides) - [When Not to Use Dispenso](#nottouse) - [Documentation and Examples](#examples) - [Benchmark Results](#benchresults) - [Installing](#installing) - [Building](#building) - [Known Issues](#knownissues) - [License](#license)
## Choose Dispenso If...

- You need **nested parallelism** without thread explosion (see the sketch after this list)
- You want **sanitizer-clean** (ASAN/TSAN) concurrent code
- You want **explicit control over thread pools** rather than implicit global state
- You need **compute-bound futures**, not I/O-bound async
- You want **stable APIs** and minimal dependencies
- You need **cross-platform portability** from a C++14 baseline
- You have **multiple independent parallel loops** that can overlap (cascading `parallel_for`)
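A minimal sketch of the nested-parallelism point above, using only the `dispenso::parallel_for` call shown in the Quick Start below; both loop levels draw work from the same work-stealing pool:

```cpp
#include <dispenso/parallel_for.h>

#include <vector>

int main() {
  std::vector<std::vector<float>> tiles(64, std::vector<float>(4096, 1.0f));

  // Outer parallel loop over tiles.
  dispenso::parallel_for(size_t{0}, tiles.size(), [&](size_t t) {
    // Inner parallel loop within one tile. Nesting does not spawn an extra
    // pool per outer iteration, so there is no thread explosion or deadlock.
    dispenso::parallel_for(size_t{0}, tiles[t].size(), [&](size_t i) { tiles[t][i] *= 2.0f; });
  });
  return 0;
}
```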
## Features

Dispenso provides a comprehensive set of parallel programming primitives:

**Core runtime:**

* **[`ThreadPool`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_thread_pool.html)** — work-stealing thread pool backing all dispenso parallelism
* **[`TaskSet`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_task_set.html) / [`ConcurrentTaskSet`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_concurrent_task_set.html)** — task grouping with wait, cancellation, and recursive scheduling (see the sketch after this list)

**Parallel algorithms:**

* **[`parallel_for`](docs/getting_started.md#your-first-parallel-loop)** — parallel loops over indices, blocking or non-blocking (cascaded); cascading `parallel_for` enables overlapping independent loops without oversubscription
* **[`for_each`](docs/getting_started.md#parallel-iteration-with-for_each)** — parallel `std::for_each` / `std::for_each_n`
* **[`Future`](docs/getting_started.md#futures-for-async-results)** — high-performance thread-safe shared futures with `then()`, `when_all()`, and an API matching `std::experimental::shared_future`
* **[`Graph`](docs/getting_started.md#task-graphs)** — task graph execution with subgraph support and incremental re-evaluation
* **[`pipeline`](docs/getting_started.md#pipelines)** — parallel pipelining of streaming workloads

**Concurrent containers and synchronization:**

* **[`ConcurrentVector`](docs/getting_started.md#concurrentvector)** — concurrent growable vector, superset of TBB `concurrent_vector` API
* **[`Latch`](docs/getting_started.md#latch)** — one-shot barrier for thread synchronization
* **[`RWLock`](https://facebookincubator.github.io/dispenso/classdispenso_1_1_r_w_lock.html)** — reader-writer spin lock, outperforms `std::shared_mutex` under low write contention
* **`SPSCRingBuffer`** — lock-free single-producer single-consumer ring buffer *(1.5.0)*

**General-purpose utilities:**

* **`SmallVector`** — inline-storage vector (not thread-aware; similar to `folly::small_vector`) *(1.5.0)*
* **`OnceFunction`** — lightweight move-only `void()` callable
* **`PoolAllocator`** — pool allocator with pluggable backing allocation (e.g. CUDA)
* **`SmallBufferAllocator`** — fast concurrent allocation for temporary objects
* **[`ResourcePool`](docs/getting_started.md#resource-pooling)** — semaphore-like guard around pooled resources
* **`CompletionEvent`** — notifiable event with wait and timed wait
* **`AsyncRequest`** — lightweight constrained message passing
* **`ConcurrentObjectArena`** — fast same-type object arena
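A brief sketch of the core runtime pieces (`ThreadPool` plus `TaskSet`) referenced above; the task bodies are placeholders, and the full API is in the linked docs:

```cpp
#include <dispenso/task_set.h>
#include <dispenso/thread_pool.h>

#include <atomic>
#include <iostream>

int main() {
  std::atomic<int> completed{0};

  // Group related work in a TaskSet so it can be waited on (or cancelled) as a unit.
  dispenso::TaskSet tasks(dispenso::globalThreadPool());
  for (int i = 0; i < 8; ++i) {
    tasks.schedule([&completed]() { completed.fetch_add(1, std::memory_order_relaxed); });
  }

  // Blocks until every scheduled task has finished (the calling thread may help run them).
  tasks.wait();
  std::cout << "completed " << completed.load() << " tasks\n";
  return 0;
}
```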
## Quick Start

**Parallel for loop** - the most common use case:

```cpp
#include <dispenso/parallel_for.h>

// Sequential
for (size_t i = 0; i < N; ++i) {
  process(data[i]);
}

// Parallel with dispenso - just wrap it!
dispenso::parallel_for(0, N, [&](size_t i) { process(data[i]); });
```

**Install via your favorite package manager:**

```bash
# Conda
conda install -c conda-forge dispenso

# Fedora/RHEL
sudo dnf install dispenso-devel

# Or build from source (see below)
```
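For very small or very cheap loops (the regime where the OpenMP comparison below notes a compiler-support advantage), `ParForOptions::minItemsPerChunk` keeps chunks large enough to amortize scheduling overhead. A minimal sketch, assuming the options-taking overload of `parallel_for`; the value 512 is illustrative:

```cpp
#include <dispenso/parallel_for.h>

#include <vector>

void scaleCheaply(std::vector<float>& data) {
  dispenso::ParForOptions opts;
  // Require at least 512 elements per chunk so trivially cheap iterations
  // aren't swamped by task-dispatch overhead (512 is an illustrative value).
  opts.minItemsPerChunk = 512;

  dispenso::parallel_for(
      size_t{0}, data.size(), [&data](size_t i) { data[i] *= 2.0f; }, opts);
}
```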
## Comparison vs Other Libraries

### TBB (Intel Threading Building Blocks)

TBB has more functionality overall, but we built dispenso for three reasons:

1. **Sanitizer compatibility** — TBB doesn't work well with ASAN/TSAN
2. **Thread-safe shared futures** — TBB lacks a futures interface; dispenso provides `std::experimental::shared_future`-like futures safe for multiple concurrent waiters
3. **Non-Intel hardware** — we needed to control performance on diverse platforms

**Performance:** Dispenso tends to be faster for small and medium parallel loops, and on par for large ones. When many loops run independently, dispenso's cascading `parallel_for` avoids oversubscription and has delivered **32-50% speedups in production workloads** after porting from TBB at Meta. TBB lacks an equivalent mechanism.

See [Migrating from TBB](docs/migrating_from_tbb.md) for a step-by-step porting guide.

### OpenMP

OpenMP has simple syntax for basic loops but grows complex for advanced constructs. Nested `#pragma omp parallel for` inside threaded code risks thread explosion and machine exhaustion.

Dispenso outperforms OpenMP for medium and large loops. OpenMP has an advantage for very small loops due to direct compiler support, though dispenso's `minItemsPerChunk` option can close this gap by tuning the parallelism threshold for small/fast loops.

See [Migrating from OpenMP](docs/migrating_from_openmp.md) for a step-by-step porting guide.

### Folly

Folly excels at asynchronous I/O with coroutine support. Dispenso is designed for **compute-bound** work. Dispenso's futures are lighter-weight and faster for compute workloads; Folly is the better choice for I/O-heavy applications.

### TaskFlow

TaskFlow focuses on task graph execution. Dispenso has faster graph construction, faster full and partial graph execution, much lower `parallel_for` overhead (10-100x in benchmarks), and simpler/faster pipeline construction. TaskFlow does offer CUDA graph mappings, which dispenso does not currently provide.

### Others (GCD, C++ std parallelism)

GCD is Apple-specific with ports to other platforms. C++ parallel algorithms are still evolving — we are interested in enabling dispenso as a backend for `std::execution` and C++ coroutines. Contributions and benchmarks are welcome.
### Migration Guides

- **[Migrating from TBB](docs/migrating_from_tbb.md)** — API mappings, thread pool differences, and common porting patterns
- **[Migrating from OpenMP](docs/migrating_from_openmp.md)** — replacing `#pragma omp` with dispenso equivalents, handling reductions and nested parallelism (a reduction sketch follows below)
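As a taste of what the OpenMP guide covers, a `#pragma omp parallel for reduction(+:sum)` loop can be expressed with dispenso's chunked ranges by accumulating a per-chunk partial sum and combining under a mutex. This is a minimal sketch using only the primitives shown elsewhere in this README, not the guide's exact recipe; the data layout is illustrative.

```cpp
#include <dispenso/parallel_for.h>

#include <mutex>
#include <vector>

// Roughly equivalent to:
//   #pragma omp parallel for reduction(+:sum)
//   for (size_t i = 0; i < data.size(); ++i) sum += data[i];
double sumAll(const std::vector<double>& data) {
  double sum = 0.0;
  std::mutex mtx;

  dispenso::parallel_for(
      dispenso::makeChunkedRange(size_t{0}, data.size(), dispenso::ParForChunking::kStatic),
      [&](size_t i, size_t end) {
        double partial = 0.0;  // accumulate locally; touch the shared sum once per chunk
        for (; i != end; ++i) {
          partial += data[i];
        }
        std::lock_guard<std::mutex> lock(mtx);
        sum += partial;
      });
  return sum;
}
```

The migration guide covers reductions (and lower-overhead alternatives to the mutex) in more detail.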
## When Not to Use Dispenso

Dispenso isn't designed for high-latency task offload; it works best for compute-bound tasks. Using the thread pool for networking, disk I/O, or workloads with frequent TLB misses (really any scenario involving kernel context switches) may result in less than ideal performance. In these scenarios, `dispenso::Future` can be used with `dispenso::NewThreadInvoker` (see the sketch below), which should be roughly equivalent to `std::future` in performance. If you need async I/O, Folly is likely a better choice (though it still doesn't fix e.g. TLB misses).
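A sketch of that escape hatch, assuming `dispenso::async` accepts a schedulable such as `NewThreadInvoker` as its first argument (check `dispenso/future.h` and `dispenso/schedulable.h` for the exact overloads); `loadBlobFromDisk` is a hypothetical stand-in for a blocking call:

```cpp
#include <dispenso/future.h>
#include <dispenso/schedulable.h>

#include <string>

// Hypothetical blocking I/O; anything that spends its time in kernel waits qualifies.
std::string loadBlobFromDisk(const std::string& path) {
  return "contents of " + path;  // stand-in for a real read
}

dispenso::Future<std::string> loadAsync(const std::string& path) {
  // Run on a freshly spawned thread instead of the compute pool, so a long
  // kernel wait doesn't stall pool workers.
  dispenso::NewThreadInvoker invoker;
  return dispenso::async(invoker, [path]() { return loadBlobFromDisk(path); });
}
```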
## Documentation and Examples

[Documentation can be found here](https://facebookincubator.github.io/dispenso)

Here are some simple examples of what you can do in dispenso. See the tests and benchmarks for more examples.

### parallel\_for

A simple sequential loop can be parallelized with minimal changes:

```cpp
for (size_t j = 0; j < kLoops; ++j) {
  vec[j] = someFunction(j);
}
```

Becomes:

```cpp
dispenso::parallel_for(0, kLoops, [&vec](size_t j) { vec[j] = someFunction(j); });
```

### TaskSet

Schedule multiple tasks and wait for them to complete:

```cpp
void randomWorkConcurrently() {
  dispenso::TaskSet tasks(dispenso::globalThreadPool());
  tasks.schedule([&stateA]() { stateA = doA(); });
  tasks.schedule([]() { doB(); });
  // Do some work on current thread
  tasks.wait();
  // After this, A, B done.
  tasks.schedule(doC);
  tasks.schedule([&stateD]() { doD(stateD); });
}
// TaskSet's destructor waits for all scheduled tasks to finish
```

### ConcurrentTaskSet

Build a tree in parallel using recursive task scheduling:

```cpp
struct Node {
  int val;
  std::unique_ptr<Node> left, right;
};

void buildTree(dispenso::ConcurrentTaskSet& tasks, std::unique_ptr<Node>& node, int depth) {
  if (depth) {
    node = std::make_unique<Node>();
    node->val = depth;
    tasks.schedule(
        [&tasks, &left = node->left, depth]() { buildTree(tasks, left, depth - 1); });
    tasks.schedule(
        [&tasks, &right = node->right, depth]() { buildTree(tasks, right, depth - 1); });
  }
}

void buildTreeParallel() {
  std::unique_ptr<Node> root;
  dispenso::ConcurrentTaskSet tasks(dispenso::globalThreadPool());
  buildTree(tasks, root, 20);
  tasks.wait();
  // tasks would also wait here in destructor if we omitted this line
}
```

### Future

Compose asynchronous operations with futures:

```cpp
dispenso::Future<size_t> ThingProcessor::processThings() {
  auto expensiveFuture = dispenso::async([this]() { return processExpensiveThing(expensive_); });
  auto futureOfManyCheap = dispenso::async([this]() {
    size_t sum = 0;
    for (auto& thing : cheapThings_) {
      sum += processCheapThing(thing);
    }
    return sum;
  });
  return dispenso::when_all(expensiveFuture, futureOfManyCheap).then([](auto&& tuple) {
    return std::get<0>(tuple).get() + std::get<1>(tuple).get();
  });
}

auto result = thingProc->processThings();
useResult(result.get());
```

### ConcurrentVector

Safely grow a vector from multiple threads:

```cpp
dispenso::ConcurrentVector<std::unique_ptr<int>> values;
dispenso::parallel_for(
    dispenso::makeChunkedRange(0, length, dispenso::ParForChunking::kStatic),
    [&values](int i, int end) {
      values.grow_by_generator(end - i, [i]() mutable { return std::make_unique<int>(i++); });
    });
```
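### for\_each

A small sketch of `for_each`, assuming the iterator-range overload mirrors `std::for_each` (see the `for_each` docs linked in the Features section); `normalize` is an illustrative helper:

```cpp
#include <dispenso/for_each.h>

#include <vector>

// Illustrative per-element transform.
inline void normalize(float& v) {
  v = v / 255.0f;
}

void normalizeAll(std::vector<float>& pixels) {
  // Parallel counterpart of std::for_each(pixels.begin(), pixels.end(), ...).
  dispenso::for_each(pixels.begin(), pixels.end(), [](float& v) { normalize(v); });
}
```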
## Benchmark Results

Dispenso is benchmarked across Linux (x64), macOS (ARM64), Windows (x64), and Android (ARM64), comparing against OpenMP, TBB, TaskFlow, folly, and `std::async` across thread pools, parallel loops, futures, graphs, concurrent containers, and more.

**[Interactive Benchmark Dashboard](https://facebookincubator.github.io/dispenso/benchmarks/)** — explore all results with platform switching, dark/light theme, and detailed per-benchmark charts.
## Installing

Binary builds of Dispenso are available through several package managers:

- **Conda**: `conda install -c conda-forge dispenso`
- **Conan**: `conan install --requires=dispenso/1.5.0`
- **vcpkg**: `vcpkg install dispenso`
- **Homebrew**: `brew install dispenso`
- **MacPorts**: `sudo port install dispenso`
- **Fedora/RHEL**: `sudo dnf install dispenso-devel`

If your platform is not on the list, see [the next section](#building) for instructions to build from source.

[![Packaging status](https://repology.org/badge/vertical-allrepos/dispenso.svg)](https://repology.org/project/dispenso/versions)
## Building

**Linux and macOS:**

```bash
mkdir build && cd build
cmake PATH_TO_DISPENSO_ROOT
make -j
```

**Windows** (from a Developer Command Prompt):

```bash
mkdir build && cd build
cmake PATH_TO_DISPENSO_ROOT
cmake --build . --config Release
```

For detailed instructions including CMake prerequisites, installation, testing, and benchmarking, see [docs/building.md](docs/building.md).
## Known Issues

* A subset of dispenso tests is known to fail on 32-bit PPC Mac. If you have access to such a machine and are willing to help debug, it would be appreciated!

## TODO

* Enable Windows benchmarks through CMake. *(may be resolved soon — actively being worked on)*
## License The library is released under the MIT license, but also relies on the (excellent) moodycamel concurrentqueue library, which is released under the Simplified BSD and Zlib licenses. See the top of the source at `dispenso/third-party/moodycamel/*.h` for details. ================================================ FILE: benchmarks/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.12) # Try to find Taskflow: system install, DISPENSO_DEPS_DIR, ~/dispenso_deps, then FetchContent. find_package(Taskflow QUIET) if(NOT Taskflow_FOUND) # Check well-known local directories for a Taskflow source tree. # Searches for both "taskflow" and versioned names like "taskflow-3.11.0". if(DEFINED DISPENSO_DEPS_DIR) file(GLOB _taskflow_dep_candidates "${DISPENSO_DEPS_DIR}/taskflow*") list(APPEND _taskflow_search_dirs ${_taskflow_dep_candidates}) endif() file(GLOB _taskflow_home_candidates "$ENV{HOME}/dispenso_deps/taskflow*") list(APPEND _taskflow_search_dirs ${_taskflow_home_candidates}) set(_taskflow_local_dir "") foreach(_dir ${_taskflow_search_dirs}) if(EXISTS "${_dir}/taskflow/taskflow.hpp") set(_taskflow_local_dir "${_dir}") break() endif() endforeach() if(_taskflow_local_dir) message(STATUS "Found local Taskflow at ${_taskflow_local_dir}") add_library(taskflow INTERFACE) target_include_directories(taskflow INTERFACE "${_taskflow_local_dir}") else() message(STATUS "Taskflow not found locally, fetching from GitHub...") include(FetchContent) message(STATUS "Using up-to-date taskflow") FetchContent_Declare( taskflow GIT_REPOSITORY https://github.com/taskflow/taskflow.git GIT_TAG v3.6.0 CONFIGURE_COMMAND "" BUILD_COMMAND "" ) FetchContent_GetProperties(taskflow) if(NOT taskflow_POPULATED) FetchContent_Populate(taskflow) endif() FetchContent_MakeAvailable(taskflow) add_library(taskflow INTERFACE) target_include_directories(taskflow INTERFACE ${taskflow_SOURCE_DIR}) endif() else() message(STATUS "Found system Taskflow") if(NOT TARGET taskflow) add_library(taskflow INTERFACE) target_link_libraries(taskflow INTERFACE Taskflow::Taskflow) endif() endif() find_package(benchmark QUIET) if(NOT benchmark_FOUND) message(STATUS "Google Benchmark not found locally, fetching from GitHub...") include(FetchContent) FetchContent_Declare( benchmark GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.3 ) set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable benchmark tests") set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "Disable gtest tests") FetchContent_MakeAvailable(benchmark) else() message(STATUS "Found system Google Benchmark") endif() # OpenMP support (including Windows/MSVC) find_package(OpenMP) # TBB support - prefer CONFIG mode to find TBBConfig.cmake from vcpkg/modern installs find_package(TBB CONFIG QUIET) if(NOT TBB_FOUND) # Fall back to old FindTBB.cmake module find_package(TBB QUIET) endif() find_package(folly) if (WIN32) set (REQUIRED_LIBS dispenso benchmark::benchmark benchmark::benchmark_main taskflow) else (WIN32) set (REQUIRED_LIBS dispenso benchmark::benchmark benchmark::benchmark_main pthread taskflow) endif (WIN32) if (TBB_FOUND) # Handle both old TBB (lowercase 'tbb' target) and new oneTBB (TBB::tbb target) if(TARGET TBB::tbb) set (OPTIONAL_LIBS ${OPTIONAL_LIBS} TBB::tbb) elseif(TARGET tbb) set (OPTIONAL_LIBS ${OPTIONAL_LIBS} tbb) else() set (OPTIONAL_LIBS 
${OPTIONAL_LIBS} TBB::tbb) endif() else (TBB_FOUND) add_compile_definitions(BENCHMARK_WITHOUT_TBB) endif (TBB_FOUND) if (OpenMP_CXX_FOUND) set (OPTIONAL_LIBS ${OPTIONAL_LIBS} OpenMP::OpenMP_CXX) endif (OpenMP_CXX_FOUND) if (FOLLY_LIBRARIES AND NOT ${BENCHMARK_WITHOUT_FOLLY}) find_package(gflags) set (OPTIONAL_LIBS ${OPTIONAL_LIBS} ${FOLLY_LIBRARIES}) else (FOLLY_LIBRARIES AND NOT ${BENCHMARK_WITHOUT_FOLLY}) add_compile_definitions(BENCHMARK_WITHOUT_FOLLY) endif (FOLLY_LIBRARIES AND NOT ${BENCHMARK_WITHOUT_FOLLY}) file(GLOB BENCHMARK_FILES CONFIGURE_DEPENDS ${PROJECT_SOURCE_DIR}/benchmarks/*.cpp) foreach(BENCHMARK_FILE ${BENCHMARK_FILES}) set(BENCHMARK_NAME) get_filename_component(BENCHMARK_NAME ${BENCHMARK_FILE} NAME_WE) add_executable(${BENCHMARK_NAME} ${BENCHMARK_FILE}) target_compile_features(${BENCHMARK_NAME} PRIVATE cxx_std_20) target_link_libraries(${BENCHMARK_NAME} ${REQUIRED_LIBS} ${OPTIONAL_LIBS}) endforeach() if(DISPENSO_BUILD_FAST_MATH) add_subdirectory(fast_math) endif() ================================================ FILE: benchmarks/benchmark_common.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #if defined(__GNUC__) || defined(__clang__) #define UNUSED_VAR myLocalForLoopVar __attribute__((unused)) #elif defined(_MSC_VER) #define UNUSED_VAR myLocalForLoopVar __pragma(warning(suppress : 4100)) #else #define UNUSED_VAR myLocalForLoopVar #endif ================================================ FILE: benchmarks/cascading_parallel_for_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * Benchmark demonstrating dispenso's cascading parallel_for advantage. * * When multiple independent parallel_for loops need to run, dispenso can * overlap them on a shared TaskSet using ParForOptions{.wait = false}. * TBB and OpenMP each impose an implicit barrier per parallel_for call, * forcing sequential execution of independent loops. */ #include #if defined(_OPENMP) #include #endif #include #include #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_for.h" #include "tbb/task_group.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" static constexpr int32_t kSmallSize = 1000; static constexpr int32_t kMediumSize = 100000; static constexpr int32_t kLargeSize = 10000000; static constexpr int32_t kNumLoops = 8; // Minimum work per chunk to amortize scheduling overhead for cheap lambdas. // With trivial compute (~4 integer ops ≈ 2ns/element), 512 elements ≈ 1µs // of work, comfortably covering task dispatch cost on Windows. 
static constexpr uint32_t kMinItemsPerChunk = 512; static uint32_t kSeed(42); inline int32_t compute(int32_t x) { return x * x - 3 * x + 7; } inline int32_t fuse(const std::array& values) { int32_t result = 0; for (int32_t k = 0; k < kNumLoops; ++k) { result += values[static_cast(k)]; } return result; } struct BenchArrays { std::array, kNumLoops> inputs; std::array, kNumLoops> outputs; std::vector result; }; BenchArrays& getArrays(int32_t numElements) { static std::unordered_map arrays; auto it = arrays.find(numElements); if (it != arrays.end()) { return it->second; } srand(kSeed); BenchArrays ba; for (int32_t k = 0; k < kNumLoops; ++k) { ba.inputs[static_cast(k)].reserve(static_cast(numElements)); for (int32_t i = 0; i < numElements; ++i) { ba.inputs[static_cast(k)].push_back((rand() & 255) - 127); } ba.outputs[static_cast(k)].resize(static_cast(numElements), 0); } ba.result.resize(static_cast(numElements), 0); auto res = arrays.emplace(numElements, std::move(ba)); assert(res.second); return res.first->second; } void checkResults(BenchArrays& ba, int32_t numElements) { for (int32_t i = 0; i < numElements; ++i) { auto idx = static_cast(i); std::array expected; for (int32_t k = 0; k < kNumLoops; ++k) { expected[static_cast(k)] = compute(ba.inputs[static_cast(k)][idx]); } int32_t expectedFused = fuse(expected); if (ba.result[idx] != expectedFused) { std::cerr << "FAIL at index " << i << ": got " << ba.result[idx] << " expected " << expectedFused << std::endl; abort(); } } } void BM_serial(benchmark::State& state) { const int32_t numElements = state.range(0); auto& ba = getArrays(numElements); for (auto UNUSED_VAR : state) { for (int32_t k = 0; k < kNumLoops; ++k) { auto kk = static_cast(k); for (int32_t i = 0; i < numElements; ++i) { ba.outputs[kk][static_cast(i)] = compute(ba.inputs[kk][static_cast(i)]); } } for (int32_t i = 0; i < numElements; ++i) { auto idx = static_cast(i); std::array vals; for (int32_t k = 0; k < kNumLoops; ++k) { vals[static_cast(k)] = ba.outputs[static_cast(k)][idx]; } ba.result[idx] = fuse(vals); } } checkResults(ba, numElements); } void BM_dispenso_blocking(benchmark::State& state) { const int32_t numThreads = state.range(0) - 1; const int32_t numElements = state.range(1); auto& ba = getArrays(numElements); dispenso::ThreadPool pool(numThreads); dispenso::ParForOptions opts; opts.minItemsPerChunk = kMinItemsPerChunk; for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); for (int32_t k = 0; k < kNumLoops; ++k) { auto kk = static_cast(k); dispenso::parallel_for( tasks, 0, numElements, [&inputs = ba.inputs[kk], &outputs = ba.outputs[kk]](int32_t i) { outputs[static_cast(i)] = compute(inputs[static_cast(i)]); }, opts); } dispenso::parallel_for( tasks, 0, numElements, [&ba](int32_t i) { auto idx = static_cast(i); std::array vals; for (int32_t k = 0; k < kNumLoops; ++k) { vals[static_cast(k)] = ba.outputs[static_cast(k)][idx]; } ba.result[idx] = fuse(vals); }, opts); } checkResults(ba, numElements); } void BM_dispenso_cascaded(benchmark::State& state) { const int32_t numThreads = state.range(0) - 1; const int32_t numElements = state.range(1); auto& ba = getArrays(numElements); dispenso::ThreadPool pool(numThreads); dispenso::ParForOptions noWait; noWait.wait = false; noWait.minItemsPerChunk = kMinItemsPerChunk; dispenso::ParForOptions opts; opts.minItemsPerChunk = kMinItemsPerChunk; for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); // First N-1 loops: non-blocking, returns immediately for (int32_t k = 0; k < kNumLoops - 1; ++k) { auto kk = 
static_cast(k); dispenso::parallel_for( tasks, 0, numElements, [&inputs = ba.inputs[kk], &outputs = ba.outputs[kk]](int32_t i) { outputs[static_cast(i)] = compute(inputs[static_cast(i)]); }, noWait); } // Last independent loop: blocking — calling thread participates, // implicitly waits for all prior non-blocking loops too { constexpr auto kk = static_cast(kNumLoops - 1); dispenso::parallel_for( tasks, 0, numElements, [&inputs = ba.inputs[kk], &outputs = ba.outputs[kk]](int32_t i) { outputs[static_cast(i)] = compute(inputs[static_cast(i)]); }, opts); } // Fusion: blocking (depends on all outputs being complete) dispenso::parallel_for( tasks, 0, numElements, [&ba](int32_t i) { auto idx = static_cast(i); std::array vals; for (int32_t k = 0; k < kNumLoops; ++k) { vals[static_cast(k)] = ba.outputs[static_cast(k)][idx]; } ba.result[idx] = fuse(vals); }, opts); } checkResults(ba, numElements); } #if defined(_OPENMP) void BM_omp(benchmark::State& state) { const int32_t numThreads = state.range(0); const int32_t numElements = state.range(1); auto& ba = getArrays(numElements); omp_set_num_threads(numThreads); for (auto UNUSED_VAR : state) { for (int32_t k = 0; k < kNumLoops; ++k) { auto kk = static_cast(k); #pragma omp parallel for for (int32_t i = 0; i < numElements; ++i) { ba.outputs[kk][static_cast(i)] = compute(ba.inputs[kk][static_cast(i)]); } } #pragma omp parallel for for (int32_t i = 0; i < numElements; ++i) { auto idx = static_cast(i); std::array vals; for (int32_t k = 0; k < kNumLoops; ++k) { vals[static_cast(k)] = ba.outputs[static_cast(k)][idx]; } ba.result[idx] = fuse(vals); } } checkResults(ba, numElements); } #endif /*defined(_OPENMP)*/ #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int32_t numThreads = state.range(0); const int32_t numElements = state.range(1); auto& ba = getArrays(numElements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(numThreads); for (int32_t k = 0; k < kNumLoops; ++k) { auto kk = static_cast(k); tbb::parallel_for( tbb::blocked_range(0, numElements), [&inputs = ba.inputs[kk], &outputs = ba.outputs[kk]](const tbb::blocked_range& r) { for (int32_t i = r.begin(); i != r.end(); ++i) { outputs[static_cast(i)] = compute(inputs[static_cast(i)]); } }); } tbb::parallel_for( tbb::blocked_range(0, numElements), [&ba](const tbb::blocked_range& r) { for (int32_t i = r.begin(); i != r.end(); ++i) { auto idx = static_cast(i); std::array vals; for (int32_t k = 0; k < kNumLoops; ++k) { vals[static_cast(k)] = ba.outputs[static_cast(k)][idx]; } ba.result[idx] = fuse(vals); } }); } checkResults(ba, numElements); } // TBB with task_group: launch all independent parallel_for loops concurrently // via task_group, then wait — emulating dispenso's cascading behavior. 
void BM_tbb_task_group(benchmark::State& state) { const int32_t numThreads = state.range(0); const int32_t numElements = state.range(1); auto& ba = getArrays(numElements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(numThreads); tbb::task_group tg; for (int32_t k = 0; k < kNumLoops; ++k) { auto kk = static_cast(k); tg.run([&inputs = ba.inputs[kk], &outputs = ba.outputs[kk], numElements]() { tbb::parallel_for( tbb::blocked_range(0, numElements), [&inputs, &outputs](const tbb::blocked_range& r) { for (int32_t i = r.begin(); i != r.end(); ++i) { outputs[static_cast(i)] = compute(inputs[static_cast(i)]); } }); }); } tg.wait(); tbb::parallel_for( tbb::blocked_range(0, numElements), [&ba](const tbb::blocked_range& r) { for (int32_t i = r.begin(); i != r.end(); ++i) { auto idx = static_cast(i); std::array vals; for (int32_t k = 0; k < kNumLoops; ++k) { vals[static_cast(k)] = ba.outputs[static_cast(k)][idx]; } ba.result[idx] = fuse(vals); } }); } checkResults(ba, numElements); } #endif // !BENCHMARK_WITHOUT_TBB static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } BENCHMARK(BM_serial)->Args({kSmallSize})->Args({kMediumSize})->Args({kLargeSize})->UseRealTime(); #if defined(_OPENMP) BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif // OPENMP #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_tbb_task_group)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_blocking)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_cascaded)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/concurrent_vector_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/concurrent_vector.h" #endif // !BENCHMARK_WITHOUT_TBB #include #include #include "thread_benchmark_common.h" constexpr size_t kLength = (1 << 20); void checkIotaSum(int64_t sum) { if (sum != (static_cast(kLength - 1) * kLength) / 2) { std::cout << sum << " vs " << ((kLength - 1) * kLength) / 2 << std::endl; std::abort(); } } template void checkIotaSum(const Cont& c, int64_t sum) { if (sum != (static_cast(kLength - 1) * kLength) / 2) { std::cout << sum << " vs " << ((kLength - 1) * kLength) / 2 << std::endl; std::vector accountedFor(kLength); for (auto v : c) { accountedFor[v] = 1; } for (size_t i = 0; i < kLength; ++i) { if (!accountedFor[i]) { std::cout << "missing " << i << std::endl; } } std::abort(); } } template void pushBackImpl(benchmark::State& state, ContainerInit containerInit) { for (auto UNUSED_VAR : state) { auto values = containerInit(); for (size_t i = 0; i < kLength; ++i) { values.push_back(i); } } } #if !defined(BENCHMARK_WITHOUT_TBB) template void pushBackGrowByAlternativeTbb(benchmark::State& state, ContainerInit containerInit) { for (auto UNUSED_VAR : state) { auto values = containerInit(); auto it = values.grow_by(kLength); auto end = values.end(); size_t i = 0; for (; it != end; ++it) { *it = i++; } } } #endif // !BENCHMARK_WITHOUT_TBB template void pushBackGrowByAlternativeDispenso(benchmark::State& state, ContainerInit containerInit) { for (auto UNUSED_VAR : state) { auto values = containerInit(); values.grow_by_generator(kLength, [i = size_t{0}]() mutable { return i++; }); } } void BM_std_push_back_serial(benchmark::State& state) { pushBackImpl(state, []() { return std::vector(); }); } void BM_deque_push_back_serial(benchmark::State& state) { pushBackImpl(state, []() { return std::deque(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_push_back_serial(benchmark::State& state) { pushBackImpl(state, []() { return tbb::concurrent_vector(); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_push_back_serial(benchmark::State& state) { pushBackImpl(state, []() { return dispenso::ConcurrentVector(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_push_back_serial_grow_by_alternative(benchmark::State& state) { pushBackGrowByAlternativeTbb(state, []() { return tbb::concurrent_vector(); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_push_back_serial_grow_by_alternative(benchmark::State& state) { pushBackGrowByAlternativeDispenso(state, []() { return dispenso::ConcurrentVector(); }); } void BM_std_push_back_serial_reserve(benchmark::State& state) { pushBackImpl(state, []() { std::vector v; v.reserve(kLength); return v; }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_push_back_serial_reserve(benchmark::State& state) { pushBackImpl(state, []() { tbb::concurrent_vector v; v.reserve(kLength); return v; }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_push_back_serial_reserve(benchmark::State& state) { pushBackImpl( state, []() { return dispenso::ConcurrentVector(kLength, dispenso::ReserveTag); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_push_back_serial_grow_by_alternative_reserve(benchmark::State& state) { pushBackGrowByAlternativeTbb(state, []() { tbb::concurrent_vector v; v.reserve(kLength); return v; }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_push_back_serial_grow_by_alternative_reserve(benchmark::State& state) { pushBackGrowByAlternativeDispenso( state, []() { return dispenso::ConcurrentVector(kLength, 
dispenso::ReserveTag); }); } template void iterateImpl(benchmark::State& state, ContainerInit containerInit) { auto values = containerInit(); for (size_t i = 0; i < kLength; ++i) { values.push_back(i); } int64_t sum; for (auto UNUSED_VAR : state) { sum = 0; for (auto i : values) { sum += i; } } checkIotaSum(sum); } void BM_std_iterate(benchmark::State& state) { iterateImpl(state, []() { return std::vector(); }); } void BM_deque_iterate(benchmark::State& state) { iterateImpl(state, []() { return std::deque(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_iterate(benchmark::State& state) { iterateImpl(state, []() { return tbb::concurrent_vector(); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_iterate(benchmark::State& state) { iterateImpl(state, []() { return dispenso::ConcurrentVector(); }); } template struct ReverseWrapper { T& iterable; }; template auto begin(ReverseWrapper w) { return std::rbegin(w.iterable); } template auto end(ReverseWrapper w) { return std::rend(w.iterable); } template ReverseWrapper reverse(T&& iterable) { return {iterable}; } template void iterateReverseImpl(benchmark::State& state, ContainerInit containerInit) { auto values = containerInit(); for (size_t i = 0; i < kLength; ++i) { values.push_back(i); } int64_t sum; for (auto UNUSED_VAR : state) { sum = 0; for (auto i : reverse(values)) { sum += i; } } checkIotaSum(sum); } void BM_std_iterate_reverse(benchmark::State& state) { iterateReverseImpl(state, []() { return std::vector(); }); } void BM_deque_iterate_reverse(benchmark::State& state) { iterateReverseImpl(state, []() { return std::deque(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_iterate_reverse(benchmark::State& state) { iterateReverseImpl(state, []() { return tbb::concurrent_vector(); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_iterate_reverse(benchmark::State& state) { iterateReverseImpl(state, []() { return dispenso::ConcurrentVector(); }); } template void lowerBoundImpl(benchmark::State& state, ContainerInit containerInit) { auto values = containerInit(); for (size_t i = 0; i < kLength; ++i) { values.push_back(i); } int64_t sum; for (auto UNUSED_VAR : state) { sum = 0; for (size_t i = 0; i < kLength; ++i) { sum += std::lower_bound(std::begin(values), std::end(values), i) - std::begin(values); } } checkIotaSum(sum); } void BM_std_lower_bound(benchmark::State& state) { lowerBoundImpl(state, []() { return std::vector(); }); } void BM_deque_lower_bound(benchmark::State& state) { lowerBoundImpl(state, []() { return std::deque(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_lower_bound(benchmark::State& state) { lowerBoundImpl(state, []() { return tbb::concurrent_vector(); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_lower_bound(benchmark::State& state) { lowerBoundImpl(state, []() { return dispenso::ConcurrentVector(); }); } template void indexImpl(benchmark::State& state, ContainerInit containerInit) { auto values = containerInit(); for (size_t i = 0; i < kLength; ++i) { values.push_back(i); } int64_t sum; for (auto UNUSED_VAR : state) { sum = 0; size_t len = values.size(); for (size_t i = 0; i < len; ++i) { sum += values[i]; } } checkIotaSum(sum); } void BM_std_index(benchmark::State& state) { indexImpl(state, []() { return std::vector(); }); } void BM_deque_index(benchmark::State& state) { indexImpl(state, []() { return std::deque(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_index(benchmark::State& state) { indexImpl(state, []() { return tbb::concurrent_vector(); }); } #endif // 
!BENCHMARK_WITHOUT_TBB void BM_dispenso_index(benchmark::State& state) { indexImpl(state, []() { return dispenso::ConcurrentVector(); }); } template void randomImpl(benchmark::State& state, ContainerInit containerInit) { auto values = containerInit(); std::vector indices; for (size_t i = 0; i < kLength; ++i) { values.push_back(i); indices.push_back(i); } // Make this repeatable. std::mt19937 rng(27); std::shuffle(indices.begin(), indices.end(), rng); int64_t sum; for (auto UNUSED_VAR : state) { sum = 0; for (auto i : indices) { sum += values[i]; } } checkIotaSum(sum); } void BM_std_random(benchmark::State& state) { randomImpl(state, []() { return std::vector(); }); } void BM_deque_random(benchmark::State& state) { randomImpl(state, []() { return std::deque(); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_random(benchmark::State& state) { randomImpl(state, []() { return tbb::concurrent_vector(); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_random(benchmark::State& state) { randomImpl(state, []() { return dispenso::ConcurrentVector(); }); } template void parallelImpl( benchmark::State& state, ContainerInit containerInit, ContainerPush containerPush) { for (auto UNUSED_VAR : state) { auto values = containerInit(); dispenso::parallel_for( 0, kLength, [&values, containerPush](size_t i) { containerPush(values, i); }); } } void BM_std_parallel(benchmark::State& state) { std::mutex mtx; parallelImpl( state, []() { return std::vector(); }, [&mtx](std::vector& c, int i) { std::lock_guard lk(mtx); c.push_back(i); }); } void BM_deque_parallel(benchmark::State& state) { std::mutex mtx; parallelImpl( state, []() { return std::deque(); }, [&mtx](std::deque& c, int i) { std::lock_guard lk(mtx); c.push_back(i); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_parallel(benchmark::State& state) { parallelImpl( state, []() { return tbb::concurrent_vector(); }, [](tbb::concurrent_vector& c, int i) { c.push_back(i); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_parallel(benchmark::State& state) { parallelImpl( state, []() { return dispenso::ConcurrentVector(); }, [](dispenso::ConcurrentVector& c, int i) { c.push_back(i); }); } void BM_std_parallel_reserve(benchmark::State& state) { std::mutex mtx; parallelImpl( state, []() { std::vector v; v.reserve(kLength); return v; }, [&mtx](std::vector& c, int i) { std::lock_guard lk(mtx); c.push_back(i); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_parallel_reserve(benchmark::State& state) { parallelImpl( state, []() { tbb::concurrent_vector v; v.reserve(kLength); return v; }, [](tbb::concurrent_vector& c, int i) { c.push_back(i); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_parallel_reserve(benchmark::State& state) { parallelImpl( state, []() { return dispenso::ConcurrentVector(kLength, dispenso::ReserveTag); }, [](dispenso::ConcurrentVector& c, int i) { c.push_back(i); }); } template void parallelImplClear( benchmark::State& state, ContainerInit containerInit, ContainerPush containerPush) { auto values = containerInit(); for (auto UNUSED_VAR : state) { values.clear(); dispenso::parallel_for( 0, kLength, [&values, containerPush](size_t i) { containerPush(values, i); }); } int64_t sum = 0; for (auto i : values) { sum += i; } checkIotaSum(sum); } void BM_std_parallel_clear(benchmark::State& state) { std::mutex mtx; parallelImplClear( state, []() { return std::vector(); }, [&mtx](std::vector& c, int i) { std::lock_guard lk(mtx); c.push_back(i); }); } void BM_deque_parallel_clear(benchmark::State& state) { std::mutex mtx; 
parallelImplClear( state, []() { return std::deque(); }, [&mtx](std::deque& c, int i) { std::lock_guard lk(mtx); c.push_back(i); }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_parallel_clear(benchmark::State& state) { parallelImplClear( state, []() { return tbb::concurrent_vector(); }, [](tbb::concurrent_vector& c, int i) { c.push_back(i); }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_parallel_clear(benchmark::State& state) { parallelImplClear( state, []() { return dispenso::ConcurrentVector(); }, [](dispenso::ConcurrentVector& c, int i) { c.push_back(i); }); } template void parallelImplGrowBy( size_t growBy, benchmark::State& state, ContainerInit containerInit, ContainerPush containerPush) { auto values = containerInit(); for (auto UNUSED_VAR : state) { values.clear(); dispenso::parallel_for( dispenso::makeChunkedRange(0, kLength, dispenso::ParForChunking::kStatic), [&values, containerPush, growBy](size_t i, size_t end) { while (i + growBy <= end) { containerPush(values, i, i + growBy); i += growBy; } containerPush(values, i, end); }); } int64_t sum = 0; for (auto i : values) { sum += i; } checkIotaSum(values, sum); } void BM_std_parallel_grow_by_10(benchmark::State& state) { std::mutex mtx; parallelImplGrowBy( 10, state, []() { return std::vector(); }, [&mtx](std::vector& c, int i, int end) { std::lock_guard lk(mtx); for (; i != end; ++i) { c.push_back(i); } }); } void BM_deque_parallel_grow_by_10(benchmark::State& state) { std::mutex mtx; parallelImplGrowBy( 10, state, []() { return std::deque(); }, [&mtx](std::deque& c, int i, int end) { std::lock_guard lk(mtx); for (; i != end; ++i) { c.push_back(i); } }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_parallel_grow_by_10(benchmark::State& state) { parallelImplGrowBy( 10, state, []() { return tbb::concurrent_vector(); }, [](tbb::concurrent_vector& c, int i, int end) { auto it = c.grow_by(end - i); for (; i != end; ++i, ++it) { *it = i; } }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_parallel_grow_by_10(benchmark::State& state) { parallelImplGrowBy( 10, state, []() { return dispenso::ConcurrentVector(); }, [](dispenso::ConcurrentVector& c, int i, int end) { c.grow_by_generator(end - i, [i]() mutable { return i++; }); }); } void BM_std_parallel_grow_by_100(benchmark::State& state) { std::mutex mtx; parallelImplGrowBy( 100, state, []() { return std::vector(); }, [&mtx](std::vector& c, int i, int end) { std::lock_guard lk(mtx); for (; i != end; ++i) { c.push_back(i); } }); } void BM_deque_parallel_grow_by_100(benchmark::State& state) { std::mutex mtx; parallelImplGrowBy( 100, state, []() { return std::deque(); }, [&mtx](std::deque& c, int i, int end) { std::lock_guard lk(mtx); for (; i != end; ++i) { c.push_back(i); } }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_parallel_grow_by_100(benchmark::State& state) { parallelImplGrowBy( 100, state, []() { return tbb::concurrent_vector(); }, [](tbb::concurrent_vector& c, int i, int end) { auto it = c.grow_by(end - i); for (; i != end; ++i, ++it) { *it = i; } }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_parallel_grow_by_100(benchmark::State& state) { parallelImplGrowBy( 100, state, []() { return dispenso::ConcurrentVector(); }, [](dispenso::ConcurrentVector& c, int i, int end) { c.grow_by_generator(end - i, [i]() mutable { return i++; }); }); } template void parallelImplGrowByMax( benchmark::State& state, ContainerInit containerInit, ContainerPush containerPush) { auto values = containerInit(); for (auto UNUSED_VAR : state) { values.clear(); 
dispenso::parallel_for( dispenso::makeChunkedRange(0, kLength, dispenso::ParForChunking::kStatic), [&values, containerPush](size_t i, size_t end) { containerPush(values, i, end); }); } int64_t sum = 0; for (auto i : values) { sum += i; } checkIotaSum(sum); } void BM_std_parallel_grow_by_max(benchmark::State& state) { std::mutex mtx; parallelImplGrowByMax( state, []() { return std::vector(); }, [&mtx](std::vector& c, int i, int end) { std::lock_guard lk(mtx); for (; i != end; ++i) { c.push_back(i); } }); } void BM_deque_parallel_grow_by_max(benchmark::State& state) { std::mutex mtx; parallelImplGrowByMax( state, []() { return std::deque(); }, [&mtx](std::deque& c, int i, int end) { std::lock_guard lk(mtx); for (; i != end; ++i) { c.push_back(i); } }); } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_parallel_grow_by_max(benchmark::State& state) { parallelImplGrowByMax( state, []() { return tbb::concurrent_vector(); }, [](tbb::concurrent_vector& c, int i, int end) { auto it = c.grow_by(end - i); for (; i != end; ++i, ++it) { *it = i; } }); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_parallel_grow_by_max(benchmark::State& state) { parallelImplGrowByMax( state, []() { return dispenso::ConcurrentVector(); }, [](dispenso::ConcurrentVector& c, int i, int end) { c.grow_by_generator(end - i, [i]() mutable { return i++; }); }); } BENCHMARK(BM_std_push_back_serial); BENCHMARK(BM_deque_push_back_serial); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_push_back_serial); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_push_back_serial); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_push_back_serial_grow_by_alternative); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_push_back_serial_grow_by_alternative); BENCHMARK(BM_std_push_back_serial_reserve); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_push_back_serial_reserve); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_push_back_serial_reserve); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_push_back_serial_grow_by_alternative_reserve); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_push_back_serial_grow_by_alternative_reserve); BENCHMARK(BM_std_iterate); BENCHMARK(BM_deque_iterate); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_iterate); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_iterate); BENCHMARK(BM_std_iterate_reverse); BENCHMARK(BM_deque_iterate_reverse); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_iterate_reverse); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_iterate_reverse); BENCHMARK(BM_std_lower_bound); BENCHMARK(BM_deque_lower_bound); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_lower_bound); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_lower_bound); BENCHMARK(BM_std_index); BENCHMARK(BM_deque_index); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_index); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_index); BENCHMARK(BM_std_random); BENCHMARK(BM_deque_random); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_random); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_random); BENCHMARK(BM_std_parallel); BENCHMARK(BM_deque_parallel); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_parallel); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_parallel); BENCHMARK(BM_std_parallel_reserve); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_parallel_reserve); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_parallel_reserve); BENCHMARK(BM_std_parallel_clear); 
BENCHMARK(BM_deque_parallel_clear); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_parallel_clear); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_parallel_clear); BENCHMARK(BM_std_parallel_grow_by_10); BENCHMARK(BM_deque_parallel_grow_by_10); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_parallel_grow_by_10); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_parallel_grow_by_10); BENCHMARK(BM_std_parallel_grow_by_100); BENCHMARK(BM_deque_parallel_grow_by_100); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_parallel_grow_by_100); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_parallel_grow_by_100); BENCHMARK(BM_std_parallel_grow_by_max); BENCHMARK(BM_deque_parallel_grow_by_max); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_parallel_grow_by_max); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_parallel_grow_by_max); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/fast_math/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.12) # Apply SIMD ISA flags and Highway linkage from parent. set(FAST_MATH_EXTRA_FLAGS ${DISPENSO_FAST_MATH_SIMD_FLAGS}) set(FAST_MATH_EXTRA_LIBS "") if(DISPENSO_FAST_MATH_HIGHWAY AND TARGET hwy) list(APPEND FAST_MATH_EXTRA_LIBS hwy) # Treat Highway as a system include to suppress warnings from its headers. if(MSVC) list(APPEND FAST_MATH_EXTRA_FLAGS /external:I ${hwy_SOURCE_DIR}) else() list(APPEND FAST_MATH_EXTRA_FLAGS -isystem${hwy_SOURCE_DIR}) endif() endif() file(GLOB FAST_MATH_BENCHMARK_FILES CONFIGURE_DEPENDS *.cpp) foreach(BENCHMARK_FILE ${FAST_MATH_BENCHMARK_FILES}) set(BENCHMARK_NAME) get_filename_component(BENCHMARK_NAME ${BENCHMARK_FILE} NAME_WE) set(BENCHMARK_NAME "fast_math_${BENCHMARK_NAME}") add_executable(${BENCHMARK_NAME} ${BENCHMARK_FILE}) target_compile_features(${BENCHMARK_NAME} PRIVATE cxx_std_17) target_compile_options(${BENCHMARK_NAME} PRIVATE ${FAST_MATH_EXTRA_FLAGS}) target_link_libraries(${BENCHMARK_NAME} ${REQUIRED_LIBS} ${OPTIONAL_LIBS} ${FAST_MATH_EXTRA_LIBS}) endforeach() ================================================ FILE: benchmarks/fast_math/avx512_benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include "benchmark_helpers.h" #if defined(__AVX512F__) namespace dfm = dispenso::fast_math; namespace bench = dispenso::fast_math::bench; using Flt = __m512; // --- One-arg benchmarks --- void BM_sin_avx512(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_cos_avx512(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_tan_avx512(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::tan(x); }); } void BM_atan_avx512(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::atan(x); }); } void BM_acos_avx512(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::acos(x); }); } void BM_asin_avx512(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::asin(x); }); } void BM_exp_avx512(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp2_avx512(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_exp10_avx512(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_expm1_avx512(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::expm1(x); }); } void BM_log_avx512(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log2_avx512(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_log10_avx512(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_log1p_avx512(benchmark::State& state) { bench::runBench( state, bench::sinInputs(), [](Flt x) { return dfm::log1p(_mm512_abs_ps(x)); }); } void BM_cbrt_avx512(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_frexp_avx512(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { dfm::IntType_t e; return dfm::frexp(x, &e); }); } void BM_ldexp_avx512(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return dfm::ldexp(x, _mm512_set1_epi32(3)); }); } void BM_tanh_avx512(benchmark::State& state) { bench::runBench(state, bench::tanhInputs(), [](auto x) { return dfm::tanh(x); }); } void BM_erf_avx512(benchmark::State& state) { bench::runBench(state, bench::erfInputs(), [](auto x) { return dfm::erf(x); }); } // --- Two-arg benchmarks --- void BM_atan2_avx512(benchmark::State& state) { bench::runBench2(state, bench::expInputs(), bench::sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_hypot_avx512(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_hypot_avx512_bounds(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_pow_avx512(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_avx512_accurate(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), 
[](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_avx512_scalar_exp(benchmark::State& state) { bench::runBench(state, bench::powBaseInputs(), [](auto x) { return dfm::pow(x, 2.5f); }); } // --- Libc-packed comparisons (AVX-512-specific, kept hand-written) --- void BM_hypot_libc_avx512(benchmark::State& state) { const auto& inputs = bench::hypotInputs(); const auto& inputs2 = bench::sinInputs(); size_t idx = 0; Flt sum = _mm512_setzero_ps(); for (auto _ : state) { (void)_; alignas(64) float x[16], y[16], r[16]; _mm512_store_ps(x, inputs[idx]); _mm512_store_ps(y, inputs2[idx]); for (int32_t i = 0; i < 16; ++i) { r[i] = ::hypotf(x[i], y[i]); } sum = _mm512_add_ps(sum, _mm512_load_ps(r)); idx = (idx + 1) & bench::kInputsMask; } state.SetItemsProcessed(state.iterations() * 16); bench::consumeResult(sum); } void BM_pow_libc_avx512(benchmark::State& state) { const auto& bases = bench::powBaseInputs(); const auto& exps = bench::powExpInputs(); size_t idx = 0; Flt sum = _mm512_setzero_ps(); for (auto _ : state) { (void)_; alignas(64) float x[16], y[16], r[16]; _mm512_store_ps(x, bases[idx]); _mm512_store_ps(y, exps[idx]); for (int32_t j = 0; j < 16; ++j) r[j] = ::powf(x[j], y[j]); sum = _mm512_add_ps(sum, _mm512_load_ps(r)); idx = (idx + 1) & bench::kInputsMask; } state.SetItemsProcessed(state.iterations() * 16); bench::consumeResult(sum); } // --- Registrations --- BENCHMARK(BM_sin_avx512); BENCHMARK(BM_cos_avx512); BENCHMARK(BM_tan_avx512); BENCHMARK(BM_atan_avx512); BENCHMARK(BM_acos_avx512); BENCHMARK(BM_asin_avx512); BENCHMARK(BM_exp_avx512); BENCHMARK(BM_exp2_avx512); BENCHMARK(BM_exp10_avx512); BENCHMARK(BM_expm1_avx512); BENCHMARK(BM_log_avx512); BENCHMARK(BM_log2_avx512); BENCHMARK(BM_log10_avx512); BENCHMARK(BM_log1p_avx512); BENCHMARK(BM_cbrt_avx512); BENCHMARK(BM_frexp_avx512); BENCHMARK(BM_ldexp_avx512); BENCHMARK(BM_tanh_avx512); BENCHMARK(BM_erf_avx512); BENCHMARK(BM_atan2_avx512); BENCHMARK(BM_hypot_avx512); BENCHMARK(BM_hypot_avx512_bounds); BENCHMARK(BM_hypot_libc_avx512); BENCHMARK(BM_pow_avx512); BENCHMARK(BM_pow_avx512_accurate); BENCHMARK(BM_pow_avx512_scalar_exp); BENCHMARK(BM_pow_libc_avx512); #else // !defined(__AVX512F__) int main() { std::cout << "AVX-512 not available, skipping benchmarks." << std::endl; return 0; } #endif // defined(__AVX512F__) ================================================ FILE: benchmarks/fast_math/avx_benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include "benchmark_helpers.h" #if defined(__AVX2__) namespace dfm = dispenso::fast_math; namespace bench = dispenso::fast_math::bench; using Flt = __m256; struct BoundsTraits { static constexpr bool kMaxAccuracy = false; static constexpr bool kBoundsValues = true; }; // --- One-arg benchmarks --- void BM_sin_avx(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_sin_avx_accurate(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_cos_avx(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_cos_avx_accurate(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_tan_avx(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::tan(x); }); } void BM_atan_avx(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::atan(x); }); } void BM_acos_avx(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::acos(x); }); } void BM_asin_avx(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::asin(x); }); } void BM_exp_avx(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp_avx_accurate(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp_avx_bounds(benchmark::State& state) { bench::runBench( state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp2_avx(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_exp10_avx(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_expm1_avx(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::expm1(x); }); } void BM_log_avx(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log_avx_accurate(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log2_avx(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_log10_avx(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_log1p_avx(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](Flt x) { Flt ax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), x); return dfm::log1p(ax); }); } void BM_cbrt_avx(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_cbrt_avx_accurate(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_frexp_avx(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { dfm::IntType_t e; return dfm::frexp(x, &e); }); } void BM_ldexp_avx(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return dfm::ldexp(x, _mm256_set1_epi32(3)); }); } void BM_tanh_avx(benchmark::State& state) { bench::runBench(state, bench::tanhInputs(), [](auto x) { return dfm::tanh(x); }); } void BM_erf_avx(benchmark::State& state) { 
bench::runBench(state, bench::erfInputs(), [](auto x) { return dfm::erf(x); }); } // --- Two-arg benchmarks --- void BM_atan2_avx(benchmark::State& state) { bench::runBench2(state, bench::expInputs(), bench::sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_hypot_avx(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_hypot_avx_bounds(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_pow_avx(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_avx_accurate(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_avx_scalar_exp(benchmark::State& state) { bench::runBench(state, bench::powBaseInputs(), [](auto x) { return dfm::pow(x, 2.5f); }); } // --- Libc-packed comparison (AVX-specific, kept hand-written) --- void BM_pow_libc_avx(benchmark::State& state) { const auto& bases = bench::powBaseInputs(); const auto& exps = bench::powExpInputs(); size_t idx = 0; Flt sum = _mm256_setzero_ps(); for (auto _ : state) { (void)_; alignas(32) float x[8], y[8], r[8]; _mm256_store_ps(x, bases[idx]); _mm256_store_ps(y, exps[idx]); for (int32_t i = 0; i < 8; ++i) { r[i] = ::powf(x[i], y[i]); } sum = _mm256_add_ps(sum, _mm256_load_ps(r)); idx = (idx + 1) & bench::kInputsMask; } state.SetItemsProcessed(state.iterations() * 8); bench::consumeResult(sum); } // --- Registrations --- BENCHMARK(BM_sin_avx); BENCHMARK(BM_sin_avx_accurate); BENCHMARK(BM_cos_avx); BENCHMARK(BM_cos_avx_accurate); BENCHMARK(BM_tan_avx); BENCHMARK(BM_atan_avx); BENCHMARK(BM_acos_avx); BENCHMARK(BM_asin_avx); BENCHMARK(BM_exp_avx); BENCHMARK(BM_exp_avx_accurate); BENCHMARK(BM_exp_avx_bounds); BENCHMARK(BM_exp2_avx); BENCHMARK(BM_exp10_avx); BENCHMARK(BM_expm1_avx); BENCHMARK(BM_log_avx); BENCHMARK(BM_log_avx_accurate); BENCHMARK(BM_log2_avx); BENCHMARK(BM_log10_avx); BENCHMARK(BM_log1p_avx); BENCHMARK(BM_cbrt_avx); BENCHMARK(BM_cbrt_avx_accurate); BENCHMARK(BM_frexp_avx); BENCHMARK(BM_ldexp_avx); BENCHMARK(BM_tanh_avx); BENCHMARK(BM_erf_avx); BENCHMARK(BM_atan2_avx); BENCHMARK(BM_hypot_avx); BENCHMARK(BM_hypot_avx_bounds); BENCHMARK(BM_pow_avx); BENCHMARK(BM_pow_avx_accurate); BENCHMARK(BM_pow_avx_scalar_exp); BENCHMARK(BM_pow_libc_avx); #else // !defined(__AVX2__) int main() { std::cout << "AVX2 not available, skipping benchmarks." << std::endl; return 0; } #endif // defined(__AVX2__) ================================================ FILE: benchmarks/fast_math/benchmark_helpers.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #if __has_include("hwy/highway.h") #include "hwy/highway.h" #endif namespace dispenso { namespace fast_math { namespace bench { constexpr size_t kNumInputs = 4096; constexpr size_t kInputsMask = 4095; constexpr int32_t kMaxBenchLanes = 64; // --- Per-type primitives --- // // Each SIMD type needs: laneCount, loadVec, zeroVec, addVec, consumeResult. 
// These are template specializations so the compiler sees the exact operations // at each call site, ensuring zero overhead. template inline int32_t laneCount(); template inline Flt loadVec(const float* data); template inline Flt zeroVec(); template inline Flt addVec(Flt a, Flt b); template inline void consumeResult(Flt sum); // --- Scalar (float) --- template <> inline int32_t laneCount() { return 1; } template <> inline float loadVec(const float* data) { return *data; } template <> inline float zeroVec() { return 0.0f; } template <> inline float addVec(float a, float b) { return a + b; } template <> inline void consumeResult(float sum) { std::cout << sum << std::endl; } // --- SSE (__m128) --- #if defined(__SSE4_1__) template <> inline int32_t laneCount<__m128>() { return 4; } template <> inline __m128 loadVec<__m128>(const float* data) { return _mm_load_ps(data); } template <> inline __m128 zeroVec<__m128>() { return _mm_setzero_ps(); } template <> inline __m128 addVec<__m128>(__m128 a, __m128 b) { return _mm_add_ps(a, b); } template <> inline void consumeResult<__m128>(__m128 sum) { alignas(16) float buf[4]; _mm_store_ps(buf, sum); std::cout << buf[0] + buf[1] + buf[2] + buf[3] << std::endl; } #endif // __SSE4_1__ // --- AVX (__m256) --- #if defined(__AVX2__) template <> inline int32_t laneCount<__m256>() { return 8; } template <> inline __m256 loadVec<__m256>(const float* data) { return _mm256_load_ps(data); } template <> inline __m256 zeroVec<__m256>() { return _mm256_setzero_ps(); } template <> inline __m256 addVec<__m256>(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } template <> inline void consumeResult<__m256>(__m256 sum) { alignas(32) float buf[8]; _mm256_store_ps(buf, sum); float total = 0.0f; for (int32_t i = 0; i < 8; ++i) total += buf[i]; std::cout << total << std::endl; } #endif // __AVX2__ // --- AVX-512 (__m512) --- #if defined(__AVX512F__) template <> inline int32_t laneCount<__m512>() { return 16; } template <> inline __m512 loadVec<__m512>(const float* data) { return _mm512_load_ps(data); } template <> inline __m512 zeroVec<__m512>() { return _mm512_setzero_ps(); } template <> inline __m512 addVec<__m512>(__m512 a, __m512 b) { return _mm512_add_ps(a, b); } template <> inline void consumeResult<__m512>(__m512 sum) { alignas(64) float buf[16]; _mm512_store_ps(buf, sum); float total = 0.0f; for (int32_t i = 0; i < 16; ++i) total += buf[i]; std::cout << total << std::endl; } #endif // __AVX512F__ // --- NEON (float32x4_t) --- #if defined(__aarch64__) template <> inline int32_t laneCount() { return 4; } template <> inline float32x4_t loadVec(const float* data) { return vld1q_f32(data); } template <> inline float32x4_t zeroVec() { return vdupq_n_f32(0.0f); } template <> inline float32x4_t addVec(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); } template <> inline void consumeResult(float32x4_t sum) { alignas(16) float buf[4]; vst1q_f32(buf, sum); std::cout << buf[0] + buf[1] + buf[2] + buf[3] << std::endl; } #endif // __aarch64__ // --- Highway (HwyFloat) --- #if __has_include("hwy/highway.h") namespace hn = hwy::HWY_NAMESPACE; template <> inline int32_t laneCount() { return static_cast(hn::Lanes(HwyFloatTag())); } template <> inline HwyFloat loadVec(const float* data) { return hn::LoadU(HwyFloatTag(), data); } template <> inline HwyFloat zeroVec() { return hn::Zero(HwyFloatTag()); } template <> inline HwyFloat addVec(HwyFloat a, HwyFloat b) { return hn::Add(a.v, b.v); } template <> inline void consumeResult(HwyFloat sum) { const HwyFloatTag d; constexpr 
size_t kMaxLanes = HWY_MAX_BYTES / sizeof(float); HWY_ALIGN float buf[kMaxLanes]; hn::StoreU(sum.v, d, buf); float total = 0.0f; const size_t N = hn::Lanes(d); for (size_t i = 0; i < N; ++i) { total += buf[i]; } std::cout << total << std::endl; } #endif // hwy/highway.h // --- Input generation --- // // makeInputs(lo, hi) generates kNumInputs vectors covering the scalar // range [lo, hi] at delta = (hi - lo) / kNumInputs spacing. Each SIMD vector // packs N consecutive values, so the total coverage is N * (hi - lo). template inline std::vector makeInputs(float lo, float hi) { const int32_t N = laneCount(); float delta = (hi - lo) / static_cast(kNumInputs); std::vector inputs; inputs.reserve(kNumInputs); alignas(64) float buf[kMaxBenchLanes]; float f = lo; for (size_t i = 0; i < kNumInputs; ++i) { for (int32_t j = 0; j < N; ++j) { buf[j] = f + static_cast(j) * delta; } inputs.push_back(loadVec(buf)); f += static_cast(N) * delta; } return inputs; } // --- Pre-defined input factories --- // // These match the ranges used across all SIMD benchmark files. template inline const std::vector& sinInputs() { static auto v = makeInputs(static_cast(-M_PI / 2.0), static_cast(M_PI / 2.0)); return v; } template inline const std::vector& expInputs() { static auto v = makeInputs(-10.0f, 10.0f); return v; } template inline const std::vector& logInputs() { static auto v = makeInputs(0.001f, 10000.0f); return v; } template inline const std::vector& acosInputs() { static auto v = makeInputs(-0.999f, 0.999f); return v; } template inline const std::vector& hypotInputs() { static auto v = makeInputs(-100000.0f, 100000.0f); return v; } template inline const std::vector& tanhInputs() { static auto v = makeInputs(-5.0f, 5.0f); return v; } template inline const std::vector& erfInputs() { static auto v = makeInputs(-4.0f, 4.0f); return v; } template inline const std::vector& powBaseInputs() { static auto v = makeInputs(0.01f, 100.0f); return v; } template inline const std::vector& powExpInputs() { static auto v = makeInputs(-8.0f, 8.0f); return v; } // --- Benchmark runners --- // // runBench: one-arg function benchmark. Func is a template parameter (lambda), // so the compiler sees the exact call target and inlines at -O2. This produces // the same assembly as hand-written code. // // runBench2: two-arg function benchmark (atan2, hypot, pow). template inline void runBench(benchmark::State& state, const std::vector& inputs, Func fn) { size_t idx = 0; Flt sum = zeroVec(); for (auto _ : state) { (void)_; sum = addVec(sum, fn(inputs[idx])); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations() * static_cast(laneCount())); consumeResult(sum); } template inline void runBench2( benchmark::State& state, const std::vector& xInputs, const std::vector& yInputs, Func fn) { size_t idx = 0; Flt sum = zeroVec(); for (auto _ : state) { (void)_; sum = addVec(sum, fn(xInputs[idx], yInputs[idx])); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations() * static_cast(laneCount())); consumeResult(sum); } } // namespace bench } // namespace fast_math } // namespace dispenso ================================================ FILE: benchmarks/fast_math/benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include "benchmark_helpers.h" namespace dfm = dispenso::fast_math; namespace bench = dispenso::fast_math::bench; constexpr size_t kNumInputs = 4096; constexpr size_t kInputsMask = 4095; struct BoundsTraits { static constexpr bool kMaxAccuracy = false; static constexpr bool kBoundsValues = true; }; // --- Input generators (scalar-specific ranges, different from SIMD) --- const std::vector& acosInputs() { static std::vector inputs = []() { float delta = 2.0f / kNumInputs; std::vector inp; for (float f = -1.0f; f <= 1.0f; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& cbrtInputs() { static std::vector inputs = []() { float delta = 50000.0f / kNumInputs; std::vector inp; for (float f = -50000.f; f <= 50000.f; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& sinInputs() { static std::vector inputs = []() { float delta = M_PI / kNumInputs; std::vector inp; for (float f = -M_PI / 2.0; f <= M_PI / 2.0; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& logInputs() { static std::vector inputs = []() { float delta = 10000.0f / kNumInputs; std::vector inp; for (float f = 0.0f; f <= 10000.0f; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& hypotInputs() { static std::vector inputs = []() { float delta = 200000.0f / kNumInputs; std::vector inp; for (float f = -100000.f; f <= 100000.f; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& powBaseInputs() { static std::vector inputs = []() { float delta = 99.99f / kNumInputs; std::vector inp; for (float f = 0.01f; f <= 100.0f; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& powExpInputs() { static std::vector inputs = []() { float delta = 16.0f / kNumInputs; std::vector inp; for (float f = -8.0f; f <= 8.0f; f += delta) { inp.push_back(f); } while (inp.size() < kNumInputs) { inp.push_back(inp.back()); } return inp; }(); return inputs; } const std::vector& tanhInputs() { static std::vector inputs = []() { float delta = 10.0f / kNumInputs; std::vector inp; for (float f = -5.0f; inp.size() < kNumInputs; f += delta) { inp.push_back(f); } return inp; }(); return inputs; } // --- Libc benchmarks --- void BM_acos(benchmark::State& state) { bench::runBench(state, acosInputs(), [](auto x) { return ::acosf(x); }); } void BM_asin(benchmark::State& state) { bench::runBench(state, acosInputs(), [](auto x) { return ::asinf(x); }); } void BM_atan(benchmark::State& state) { bench::runBench(state, cbrtInputs(), [](auto x) { return ::atanf(x); }); } void BM_cbrt(benchmark::State& state) { bench::runBench(state, cbrtInputs(), [](auto x) { return ::cbrtf(x); }); } void BM_sin(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::sinf(x); }); } void BM_cos(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::cosf(x); }); } void BM_tan(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::tanf(x); }); } void BM_exp(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { 
return ::expf(x); }); } void BM_exp2(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::exp2f(x); }); } void BM_exp10(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::powf(10.0f, x); }); } void BM_log(benchmark::State& state) { bench::runBench(state, logInputs(), [](auto x) { return ::logf(x); }); } void BM_log2(benchmark::State& state) { bench::runBench(state, logInputs(), [](auto x) { return ::log2f(x); }); } void BM_log10(benchmark::State& state) { bench::runBench(state, logInputs(), [](auto x) { return ::log10f(x); }); } void BM_expm1(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::expm1f(x); }); } void BM_log1p(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::log1pf(std::fabs(x)); }); } void BM_tanh(benchmark::State& state) { bench::runBench(state, tanhInputs(), [](auto x) { return ::tanhf(x); }); } void BM_sin_plus_cos(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return ::sinf(x) + ::cosf(x); }); } void BM_atan2(benchmark::State& state) { bench::runBench2(state, cbrtInputs(), sinInputs(), [](auto y, auto x) { return ::atan2f(y, x); }); } void BM_hypot(benchmark::State& state) { bench::runBench2( state, hypotInputs(), sinInputs(), [](auto x, auto y) { return ::hypotf(x, y); }); } void BM_pow(benchmark::State& state) { bench::runBench2( state, powBaseInputs(), powExpInputs(), [](auto b, auto e) { return ::powf(b, e); }); } // frexp and ldexp use non-standard loop patterns — kept hand-written. void BM_frexp(benchmark::State& state) { const auto& inputs = sinInputs(); size_t idx = 0; float sum = 0.0f; int exp; int64_t expSum = 0; for (auto _ : state) { (void)_; sum += ::frexpf(inputs[idx], &exp); expSum += exp; idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << " " << expSum << std::endl; } void BM_ldexp(benchmark::State& state) { const auto& inputs = sinInputs(); size_t idx = 0; float sum = 0.0f; for (auto _ : state) { (void)_; sum += ::ldexpf(inputs[idx], idx & 7); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << std::endl; } // --- Dispenso fast_math benchmarks --- void BM_fastm_acos(benchmark::State& state) { bench::runBench(state, acosInputs(), [](auto x) { return dfm::acos(x); }); } void BM_fastm_asin(benchmark::State& state) { bench::runBench(state, acosInputs(), [](auto x) { return dfm::asin(x); }); } void BM_fastm_atan(benchmark::State& state) { bench::runBench(state, cbrtInputs(), [](auto x) { return dfm::atan(x); }); } void BM_fastm_cbrt(benchmark::State& state) { bench::runBench(state, cbrtInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_fastm_cbrt_accurate(benchmark::State& state) { bench::runBench( state, cbrtInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_fastm_sin(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_fastm_sin_accurate(benchmark::State& state) { bench::runBench( state, sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_fastm_cos(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_fastm_cos_accurate(benchmark::State& state) { bench::runBench( state, sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_fastm_tan(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::tan(x); }); } void 
BM_fastm_tan_accurate(benchmark::State& state) { bench::runBench( state, sinInputs(), [](auto x) { return dfm::tan(x); }); } void BM_fastm_exp(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::exp(x); }); } void BM_fastm_exp_bounds(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::exp(x); }); } void BM_fastm_exp_accurate(benchmark::State& state) { bench::runBench( state, sinInputs(), [](auto x) { return dfm::exp(x); }); } void BM_fastm_exp2(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_fastm_exp2_accurate(benchmark::State& state) { bench::runBench( state, sinInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_fastm_exp10(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_fastm_exp10_accurate(benchmark::State& state) { bench::runBench( state, sinInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_fastm_log(benchmark::State& state) { bench::runBench(state, logInputs(), [](auto x) { return dfm::log(x); }); } void BM_fastm_log_accurate(benchmark::State& state) { bench::runBench( state, logInputs(), [](auto x) { return dfm::log(x); }); } void BM_fastm_log2(benchmark::State& state) { bench::runBench(state, logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_fastm_log2_accurate(benchmark::State& state) { bench::runBench( state, logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_fastm_log10(benchmark::State& state) { bench::runBench(state, logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_fastm_log10_accurate(benchmark::State& state) { bench::runBench( state, logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_fastm_expm1(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::expm1(x); }); } void BM_fastm_log1p(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::log1p(std::fabs(x)); }); } void BM_fastm_tanh(benchmark::State& state) { bench::runBench(state, tanhInputs(), [](auto x) { return dfm::tanh(x); }); } void BM_fastm_atan2(benchmark::State& state) { bench::runBench2( state, cbrtInputs(), sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_fastm_atan2_bounds(benchmark::State& state) { bench::runBench2(state, cbrtInputs(), sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_fastm_hypot(benchmark::State& state) { bench::runBench2( state, hypotInputs(), sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_fastm_hypot_bounds(benchmark::State& state) { bench::runBench2(state, hypotInputs(), sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_naive_hypot(benchmark::State& state) { bench::runBench2( state, hypotInputs(), sinInputs(), [](auto x, auto y) { return sqrtf(fmaf(x, x, y * y)); }); } void BM_fastm_pow(benchmark::State& state) { bench::runBench2( state, powBaseInputs(), powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_fastm_pow_accurate(benchmark::State& state) { bench::runBench2(state, powBaseInputs(), powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } // frexp and ldexp use non-standard loop patterns — kept hand-written. 
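// (Why they don't fit the shared runners: frexp returns a second result through an out-parameter
// — the exponent — so a single-accumulator runBench can't consume both outputs, and ldexp's
// integer argument varies per iteration (idx & 7), so it isn't a pure one-input function of the
// vector element. The loops below therefore accumulate the mantissa sum and the exponent sum by hand.)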
void BM_fastm_frexp(benchmark::State& state) { const auto& inputs = sinInputs(); size_t idx = 0; float sum = 0.0f; int exp; int64_t expSum = 0; for (auto _ : state) { (void)_; sum += dfm::frexp(inputs[idx], &exp); expSum += exp; idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << " " << expSum << std::endl; } void BM_fastm_ldexp(benchmark::State& state) { const auto& inputs = sinInputs(); size_t idx = 0; float sum = 0.0f; for (auto _ : state) { (void)_; sum += dfm::ldexp(inputs[idx], idx & 7); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << std::endl; } // --- sincos / sincospi benchmarks --- void BM_fastm_sin_plus_cos(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::sin(x) + dfm::cos(x); }); } void BM_fastm_sincos(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { float s, c; dfm::sincos(x, &s, &c); return s + c; }); } void BM_fastm_sinpi(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::sinpi(x); }); } void BM_fastm_cospi(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { return dfm::cospi(x); }); } void BM_fastm_sincospi(benchmark::State& state) { bench::runBench(state, sinInputs(), [](auto x) { float s, c; dfm::sincospi(x, &s, &c); return s + c; }); } // --- Batch benchmarks: explicit SIMD via SseFloat (4-wide SSE) --- #if defined(__SSE4_1__) #include static void BM_batch_sinf(benchmark::State& state) { const auto& inputs = sinInputs(); alignas(16) float outputs[kNumInputs]; for (auto _ : state) { (void)_; for (size_t i = 0; i < kNumInputs; ++i) { outputs[i] = ::sinf(inputs[i]); } benchmark::DoNotOptimize(outputs); } state.SetItemsProcessed(state.iterations() * kNumInputs); } static void BM_batch_sin_scalar(benchmark::State& state) { const auto& inputs = sinInputs(); alignas(16) float outputs[kNumInputs]; for (auto _ : state) { (void)_; for (size_t i = 0; i < kNumInputs; ++i) { outputs[i] = dfm::sin(inputs[i]); } benchmark::DoNotOptimize(outputs); } state.SetItemsProcessed(state.iterations() * kNumInputs); } static void BM_batch_sin_sse(benchmark::State& state) { using namespace dispenso::fast_math; const auto& inputs = sinInputs(); alignas(16) float outputs[kNumInputs]; for (auto _ : state) { (void)_; for (size_t i = 0; i < kNumInputs; i += 4) { SseFloat x = _mm_loadu_ps(&inputs[i]); SseFloat r = sin(x); _mm_storeu_ps(&outputs[i], r.v); } benchmark::DoNotOptimize(outputs); } state.SetItemsProcessed(state.iterations() * kNumInputs); } static void BM_batch_cos_scalar(benchmark::State& state) { const auto& inputs = sinInputs(); alignas(16) float outputs[kNumInputs]; for (auto _ : state) { (void)_; for (size_t i = 0; i < kNumInputs; ++i) { outputs[i] = dfm::cos(inputs[i]); } benchmark::DoNotOptimize(outputs); } state.SetItemsProcessed(state.iterations() * kNumInputs); } static void BM_batch_cos_sse(benchmark::State& state) { using namespace dispenso::fast_math; const auto& inputs = sinInputs(); alignas(16) float outputs[kNumInputs]; for (auto _ : state) { (void)_; for (size_t i = 0; i < kNumInputs; i += 4) { SseFloat x = _mm_loadu_ps(&inputs[i]); SseFloat r = cos(x); _mm_storeu_ps(&outputs[i], r.v); } benchmark::DoNotOptimize(outputs); } state.SetItemsProcessed(state.iterations() * kNumInputs); } BENCHMARK(BM_batch_sinf); BENCHMARK(BM_batch_sin_scalar); BENCHMARK(BM_batch_sin_sse); BENCHMARK(BM_batch_cos_scalar); BENCHMARK(BM_batch_cos_sse); #endif // __SSE4_1__ // --- 
Registrations --- BENCHMARK(BM_acos); BENCHMARK(BM_fastm_acos); BENCHMARK(BM_asin); BENCHMARK(BM_fastm_asin); BENCHMARK(BM_atan); BENCHMARK(BM_fastm_atan); BENCHMARK(BM_atan2); BENCHMARK(BM_fastm_atan2); BENCHMARK(BM_fastm_atan2_bounds); BENCHMARK(BM_cbrt); BENCHMARK(BM_fastm_cbrt); BENCHMARK(BM_fastm_cbrt_accurate); BENCHMARK(BM_exp); BENCHMARK(BM_fastm_exp); BENCHMARK(BM_fastm_exp_bounds); BENCHMARK(BM_fastm_exp_accurate); BENCHMARK(BM_exp10); BENCHMARK(BM_fastm_exp10); BENCHMARK(BM_fastm_exp10_accurate); BENCHMARK(BM_exp2); BENCHMARK(BM_fastm_exp2); BENCHMARK(BM_fastm_exp2_accurate); BENCHMARK(BM_log); BENCHMARK(BM_fastm_log); BENCHMARK(BM_fastm_log_accurate); BENCHMARK(BM_log2); BENCHMARK(BM_fastm_log2); BENCHMARK(BM_fastm_log2_accurate); BENCHMARK(BM_log10); BENCHMARK(BM_fastm_log10); BENCHMARK(BM_fastm_log10_accurate); BENCHMARK(BM_sin); BENCHMARK(BM_fastm_sin); BENCHMARK(BM_fastm_sin_accurate); BENCHMARK(BM_cos); BENCHMARK(BM_fastm_cos); BENCHMARK(BM_fastm_cos_accurate); BENCHMARK(BM_frexp); BENCHMARK(BM_fastm_frexp); BENCHMARK(BM_ldexp); BENCHMARK(BM_fastm_ldexp); BENCHMARK(BM_tan); BENCHMARK(BM_fastm_tan); BENCHMARK(BM_fastm_tan_accurate); BENCHMARK(BM_hypot); BENCHMARK(BM_fastm_hypot); BENCHMARK(BM_naive_hypot); BENCHMARK(BM_fastm_hypot_bounds); BENCHMARK(BM_sin_plus_cos); BENCHMARK(BM_fastm_sin_plus_cos); BENCHMARK(BM_fastm_sincos); BENCHMARK(BM_fastm_sinpi); BENCHMARK(BM_fastm_cospi); BENCHMARK(BM_fastm_sincospi); BENCHMARK(BM_pow); BENCHMARK(BM_fastm_pow); BENCHMARK(BM_fastm_pow_accurate); BENCHMARK(BM_expm1); BENCHMARK(BM_fastm_expm1); BENCHMARK(BM_log1p); BENCHMARK(BM_fastm_log1p); BENCHMARK(BM_tanh); BENCHMARK(BM_fastm_tanh); ================================================ FILE: benchmarks/fast_math/erf_benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ // Benchmark: erf S16 (float, t-substitution + inline exp) vs S21 (double, pure polynomial Estrin). // Also benchmarks libc erff for reference. 
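// Note on the two evaluation schemes compared below (illustrative sketch only; the helpers shown
// here are not part of the benchmarks): Horner's rule chains fused multiply-adds serially, so its
// latency grows with the polynomial degree, while Estrin's scheme pairs terms and combines the
// pairs with powers of x^2 in a balanced tree, shortening the dependency chain and letting
// independent FMAs overlap. For a cubic c0 + c1*x + c2*x^2 + c3*x^3 the two look like:
//
//   double horner3(double x, double c0, double c1, double c2, double c3) {
//     return std::fma(std::fma(std::fma(c3, x, c2), x, c1), x, c0);  // 3 dependent FMAs
//   }
//   double estrin3(double x, double c0, double c1, double c2, double c3) {
//     double p01 = std::fma(c1, x, c0);   // c0 + c1*x (independent of p23)
//     double p23 = std::fma(c3, x, c2);   // c2 + c3*x
//     return std::fma(p23, x * x, p01);   // combine the pairs with x^2
//   }
//
// The S21 variant below applies the same idea to 16 coefficients in a 4-level tree.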
#include #include #include #include #include #include #if defined(__SSE4_1__) #include #if defined(__GNUC__) || defined(__clang__) #define UNUSED_VAR myLocalForLoopVar __attribute__((unused)) #elif defined(_MSC_VER) #define UNUSED_VAR myLocalForLoopVar __pragma(warning(suppress : 4100)) #else #define UNUSED_VAR myLocalForLoopVar #endif namespace dfm = dispenso::fast_math; constexpr size_t kNumInputs = 4096; constexpr size_t kInputsMask = 4095; // --- Inputs: [-4, 4] --- const std::vector& erfScalarInputs() { static std::vector inputs = []() { float delta = 8.0f / kNumInputs; std::vector inp; for (float f = -4.0f; inp.size() < kNumInputs; f += delta) { inp.push_back(f); } return inp; }(); return inputs; } const std::vector<__m128>& erfSseInputs() { static std::vector<__m128> inputs = []() { float delta = 8.0f / kNumInputs; std::vector<__m128> inp; float f = -4.0f; for (size_t i = 0; i < kNumInputs; ++i) { inp.emplace_back(_mm_set_ps(f + 3 * delta, f + 2 * delta, f + delta, f)); f += 4 * delta; } return inp; }(); return inputs; } #if defined(__AVX2__) const std::vector<__m256>& erfAvxInputs() { static std::vector<__m256> inputs = []() { float delta = 8.0f / kNumInputs; std::vector<__m256> inp; float f = -4.0f; for (size_t i = 0; i < kNumInputs; ++i) { inp.emplace_back(_mm256_set_ps( f + 7 * delta, f + 6 * delta, f + 5 * delta, f + 4 * delta, f + 3 * delta, f + 2 * delta, f + delta, f)); f += 8 * delta; } return inp; }(); return inputs; } #endif // --- S16: float, t-substitution + inline exp, 2 ULP --- static inline float erf_s16(float x) { float ax = std::fabs(x); float result; if (ax >= 3.92f) { result = 1.0f; } else if (ax >= 0.875f) { constexpr float p = 0.45f; float t = 1.0f / std::fma(p, ax, 1.0f); constexpr float c0 = 0x1.04873ep-2f; constexpr float c1 = 0x1.f81fc6p-3f; constexpr float c2 = 0x1.189f42p-2f; constexpr float c3 = 0x1.15aaa6p-5f; constexpr float c4 = 0x1.65d24ep-2f; constexpr float c5 = -0x1.4432a4p-3f; float poly = t * std::fma(std::fma(std::fma(std::fma(std::fma(c5, t, c4), t, c3), t, c2), t, c1), t, c0); float u = ax * ax; constexpr float kLog2e = 0x1.715476p+0f; constexpr float kLn2hi = 0x1.62e400p-1f; constexpr float kLn2lo = 0x1.7f7d1cp-20f; float k = std::floor(u * kLog2e); float f = std::fma(k, -kLn2hi, u); f = std::fma(k, -kLn2lo, f); constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f; constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f; constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f; float exp_neg_f = std::fma(std::fma(std::fma(std::fma(std::fma(e5, f, e4), f, e3), f, e2), f, e1), f, e0); int32_t ki = static_cast(k); float pow2_neg_k = dfm::bit_cast((127 - ki) << 23); result = 1.0f - pow2_neg_k * exp_neg_f * poly; } else { constexpr float c0 = 0x1.20dd76p+0f; constexpr float q0 = -0x1.812746p-2f, q1 = 0x1.ce2ec6p-4f, q2 = -0x1.b81edep-6f; constexpr float q3 = 0x1.556b48p-8f, q4 = -0x1.b0255p-11f, q5 = 0x1.7149c8p-14f; float u = ax * ax; float q = std::fma(std::fma(std::fma(std::fma(std::fma(q5, u, q4), u, q3), u, q2), u, q1), u, q0); result = ax * std::fma(q, u, c0); } return x < 0.0f ? 
-result : result; } // --- S21: double, pure polynomial Estrin, 1 ULP --- static inline float erf_s21(float x) { double ax = std::fabs(static_cast(x)); double result; if (ax >= 3.92) { result = 1.0; } else { double u = ax * ax; constexpr double c0 = 0x1.20dd74ce6dac1p0; constexpr double c1 = -0x1.812728fedb0c3p-2; constexpr double c2 = 0x1.ce2c679f0f94dp-4; constexpr double c3 = -0x1.b81379b046993p-6; constexpr double c4 = 0x1.55decae500c6cp-8; constexpr double c5 = -0x1.bd402ca3b1d09p-11; constexpr double c6 = 0x1.edfbbbd68d00ep-14; constexpr double c7 = -0x1.d43f94bdfb90fp-17; constexpr double c8 = 0x1.77643daca82f5p-20; constexpr double c9 = -0x1.f276d5cf346ecp-24; constexpr double c10 = 0x1.0a42c17eedcadp-27; constexpr double c11 = -0x1.b999e591ae6bap-32; constexpr double c12 = 0x1.0f73303821975p-36; constexpr double c13 = -0x1.ce969a50741b3p-42; constexpr double c14 = 0x1.e56af7f1b38e4p-48; constexpr double c15 = -0x1.d6f65766c68e5p-55; // Estrin's scheme: 4-level tree double p0 = std::fma(c1, u, c0); double p1 = std::fma(c3, u, c2); double p2 = std::fma(c5, u, c4); double p3 = std::fma(c7, u, c6); double p4 = std::fma(c9, u, c8); double p5 = std::fma(c11, u, c10); double p6 = std::fma(c13, u, c12); double p7 = std::fma(c15, u, c14); double u2 = u * u; double q0 = std::fma(p1, u2, p0); double q1 = std::fma(p3, u2, p2); double q2 = std::fma(p5, u2, p4); double q3 = std::fma(p7, u2, p6); double u4 = u2 * u2; double r0 = std::fma(q1, u4, q0); double r1 = std::fma(q3, u4, q2); double u8 = u4 * u4; double R = std::fma(r1, u8, r0); result = ax * R; } return static_cast(x < 0.0f ? -result : result); } // --- SSE S16 (float4, t-substitution + inline exp) --- static inline __m128 erf_s16_sse(__m128 x) { __m128 ax = dfm::fabs(x); __m128 sign = _mm_and_ps(x, _mm_set1_ps(-0.0f)); // Near-zero path: erf(x) = x * (c0 + u * Q(u)) __m128 u = _mm_mul_ps(ax, ax); constexpr float nc0 = 0x1.20dd76p+0f; constexpr float nq0 = -0x1.812746p-2f, nq1 = 0x1.ce2ec6p-4f, nq2 = -0x1.b81edep-6f; constexpr float nq3 = 0x1.556b48p-8f, nq4 = -0x1.b0255p-11f, nq5 = 0x1.7149c8p-14f; __m128 q = _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps(_mm_set1_ps(nq5), u, _mm_set1_ps(nq4)), u, _mm_set1_ps(nq3)), u, _mm_set1_ps(nq2)), u, _mm_set1_ps(nq1)), u, _mm_set1_ps(nq0)); __m128 near_zero = _mm_mul_ps(ax, _mm_fmadd_ps(q, u, _mm_set1_ps(nc0))); // Erfc path: erf(x) = 1 - t*P(t)*exp(-x^2) constexpr float p = 0.45f; __m128 t = _mm_div_ps(_mm_set1_ps(1.0f), _mm_fmadd_ps(_mm_set1_ps(p), ax, _mm_set1_ps(1.0f))); constexpr float pc0 = 0x1.04873ep-2f, pc1 = 0x1.f81fc6p-3f, pc2 = 0x1.189f42p-2f; constexpr float pc3 = 0x1.15aaa6p-5f, pc4 = 0x1.65d24ep-2f, pc5 = -0x1.4432a4p-3f; __m128 ppoly = _mm_mul_ps( t, _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps(_mm_set1_ps(pc5), t, _mm_set1_ps(pc4)), t, _mm_set1_ps(pc3)), t, _mm_set1_ps(pc2)), t, _mm_set1_ps(pc1)), t, _mm_set1_ps(pc0))); // Inline exp(-x^2): exp(-u) = 2^(-k) * exp(-f) constexpr float kLog2e = 0x1.715476p+0f; constexpr float kLn2hi = 0x1.62e400p-1f; constexpr float kLn2lo = 0x1.7f7d1cp-20f; __m128 kv = _mm_floor_ps(_mm_mul_ps(u, _mm_set1_ps(kLog2e))); __m128 f = _mm_fmadd_ps(kv, _mm_set1_ps(-kLn2hi), u); f = _mm_fmadd_ps(kv, _mm_set1_ps(-kLn2lo), f); constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f; constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f; constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f; __m128 exp_neg_f = _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps( _mm_fmadd_ps(_mm_fmadd_ps(_mm_set1_ps(e5), f, 
_mm_set1_ps(e4)), f, _mm_set1_ps(e3)), f, _mm_set1_ps(e2)), f, _mm_set1_ps(e1)), f, _mm_set1_ps(e0)); __m128i ki = _mm_cvtps_epi32(kv); __m128 pow2_neg_k = _mm_castsi128_ps(_mm_slli_epi32(_mm_sub_epi32(_mm_set1_epi32(127), ki), 23)); __m128 erfc_result = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_mm_mul_ps(pow2_neg_k, exp_neg_f), ppoly)); // Blend: use near_zero for |x| < 0.875, erfc for |x| >= 0.875 __m128 use_erfc = _mm_cmpge_ps(ax, _mm_set1_ps(0.875f)); __m128 result = _mm_blendv_ps(near_zero, erfc_result, use_erfc); // Clamp to 1 for |x| >= 3.92 __m128 saturated = _mm_cmpge_ps(ax, _mm_set1_ps(3.92f)); result = _mm_blendv_ps(result, _mm_set1_ps(1.0f), saturated); // Restore sign return _mm_or_ps(result, sign); } // --- SSE S21 (double poly Estrin, process 4 floats via 2x double2) --- static inline __m128 erf_s21_sse(__m128 x) { __m128 ax_f = dfm::fabs(x); __m128 sign = _mm_and_ps(x, _mm_set1_ps(-0.0f)); // Convert to double: split into low 2 and high 2 __m128d ax_lo = _mm_cvtps_pd(ax_f); __m128d ax_hi = _mm_cvtps_pd(_mm_movehl_ps(ax_f, ax_f)); auto eval_poly = [](__m128d ax) -> __m128d { __m128d u = _mm_mul_pd(ax, ax); constexpr double c0 = 0x1.20dd74ce6dac1p0; constexpr double c1 = -0x1.812728fedb0c3p-2; constexpr double c2 = 0x1.ce2c679f0f94dp-4; constexpr double c3 = -0x1.b81379b046993p-6; constexpr double c4 = 0x1.55decae500c6cp-8; constexpr double c5 = -0x1.bd402ca3b1d09p-11; constexpr double c6 = 0x1.edfbbbd68d00ep-14; constexpr double c7 = -0x1.d43f94bdfb90fp-17; constexpr double c8 = 0x1.77643daca82f5p-20; constexpr double c9 = -0x1.f276d5cf346ecp-24; constexpr double c10 = 0x1.0a42c17eedcadp-27; constexpr double c11 = -0x1.b999e591ae6bap-32; constexpr double c12 = 0x1.0f73303821975p-36; constexpr double c13 = -0x1.ce969a50741b3p-42; constexpr double c14 = 0x1.e56af7f1b38e4p-48; constexpr double c15 = -0x1.d6f65766c68e5p-55; // Estrin level 0 __m128d p0 = _mm_fmadd_pd(_mm_set1_pd(c1), u, _mm_set1_pd(c0)); __m128d p1 = _mm_fmadd_pd(_mm_set1_pd(c3), u, _mm_set1_pd(c2)); __m128d p2 = _mm_fmadd_pd(_mm_set1_pd(c5), u, _mm_set1_pd(c4)); __m128d p3 = _mm_fmadd_pd(_mm_set1_pd(c7), u, _mm_set1_pd(c6)); __m128d p4 = _mm_fmadd_pd(_mm_set1_pd(c9), u, _mm_set1_pd(c8)); __m128d p5 = _mm_fmadd_pd(_mm_set1_pd(c11), u, _mm_set1_pd(c10)); __m128d p6 = _mm_fmadd_pd(_mm_set1_pd(c13), u, _mm_set1_pd(c12)); __m128d p7 = _mm_fmadd_pd(_mm_set1_pd(c15), u, _mm_set1_pd(c14)); __m128d u2 = _mm_mul_pd(u, u); // Level 1 __m128d q0 = _mm_fmadd_pd(p1, u2, p0); __m128d q1 = _mm_fmadd_pd(p3, u2, p2); __m128d q2 = _mm_fmadd_pd(p5, u2, p4); __m128d q3 = _mm_fmadd_pd(p7, u2, p6); __m128d u4 = _mm_mul_pd(u2, u2); // Level 2 __m128d r0 = _mm_fmadd_pd(q1, u4, q0); __m128d r1 = _mm_fmadd_pd(q3, u4, q2); __m128d u8 = _mm_mul_pd(u4, u4); // Level 3 __m128d R = _mm_fmadd_pd(r1, u8, r0); return _mm_mul_pd(ax, R); }; __m128d res_lo = eval_poly(ax_lo); __m128d res_hi = eval_poly(ax_hi); // Convert back to float __m128 result = _mm_movelh_ps(_mm_cvtpd_ps(res_lo), _mm_cvtpd_ps(res_hi)); // Clamp to 1 for |x| >= 3.92 __m128 saturated = _mm_cmpge_ps(ax_f, _mm_set1_ps(3.92f)); result = _mm_blendv_ps(result, _mm_set1_ps(1.0f), saturated); // Restore sign return _mm_or_ps(result, sign); } static void consumeSum(__m128 sum) { alignas(16) float buf[4]; _mm_store_ps(buf, sum); std::cout << buf[0] + buf[1] + buf[2] + buf[3] << std::endl; } // --- Scalar benchmarks --- void BM_erf_libc(benchmark::State& state) { const auto& inputs = erfScalarInputs(); size_t idx = 0; float sum = 0.0f; for (auto UNUSED_VAR : state) { sum += 
::erff(inputs[idx]); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << std::endl; } void BM_erf_s16_scalar(benchmark::State& state) { const auto& inputs = erfScalarInputs(); size_t idx = 0; float sum = 0.0f; for (auto UNUSED_VAR : state) { sum += erf_s16(inputs[idx]); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << std::endl; } void BM_erf_s21_scalar(benchmark::State& state) { const auto& inputs = erfScalarInputs(); size_t idx = 0; float sum = 0.0f; for (auto UNUSED_VAR : state) { sum += erf_s21(inputs[idx]); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations()); std::cout << sum << std::endl; } // --- SSE benchmarks --- void BM_erf_s16_sse(benchmark::State& state) { const auto& inputs = erfSseInputs(); size_t idx = 0; __m128 sum = _mm_setzero_ps(); for (auto UNUSED_VAR : state) { sum = _mm_add_ps(sum, erf_s16_sse(inputs[idx])); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations() * 4); consumeSum(sum); } void BM_erf_s21_sse(benchmark::State& state) { const auto& inputs = erfSseInputs(); size_t idx = 0; __m128 sum = _mm_setzero_ps(); for (auto UNUSED_VAR : state) { sum = _mm_add_ps(sum, erf_s21_sse(inputs[idx])); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations() * 4); consumeSum(sum); } BENCHMARK(BM_erf_libc); BENCHMARK(BM_erf_s16_scalar); BENCHMARK(BM_erf_s21_scalar); BENCHMARK(BM_erf_s16_sse); BENCHMARK(BM_erf_s21_sse); #if defined(__AVX2__) static void consumeSum256(__m256 sum) { alignas(32) float buf[8]; _mm256_store_ps(buf, sum); float s = 0; for (int i = 0; i < 8; ++i) s += buf[i]; std::cout << s << std::endl; } // --- AVX S16 --- static inline __m256 erf_s16_avx(__m256 x) { __m256 ax = dfm::fabs(x); __m256 sign = _mm256_and_ps(x, _mm256_set1_ps(-0.0f)); // Near-zero path __m256 u = _mm256_mul_ps(ax, ax); constexpr float nc0 = 0x1.20dd76p+0f; constexpr float nq0 = -0x1.812746p-2f, nq1 = 0x1.ce2ec6p-4f, nq2 = -0x1.b81edep-6f; constexpr float nq3 = 0x1.556b48p-8f, nq4 = -0x1.b0255p-11f, nq5 = 0x1.7149c8p-14f; __m256 q = _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps(_mm256_set1_ps(nq5), u, _mm256_set1_ps(nq4)), u, _mm256_set1_ps(nq3)), u, _mm256_set1_ps(nq2)), u, _mm256_set1_ps(nq1)), u, _mm256_set1_ps(nq0)); __m256 near_zero = _mm256_mul_ps(ax, _mm256_fmadd_ps(q, u, _mm256_set1_ps(nc0))); // Erfc path constexpr float p = 0.45f; __m256 t = _mm256_div_ps( _mm256_set1_ps(1.0f), _mm256_fmadd_ps(_mm256_set1_ps(p), ax, _mm256_set1_ps(1.0f))); constexpr float pc0 = 0x1.04873ep-2f, pc1 = 0x1.f81fc6p-3f, pc2 = 0x1.189f42p-2f; constexpr float pc3 = 0x1.15aaa6p-5f, pc4 = 0x1.65d24ep-2f, pc5 = -0x1.4432a4p-3f; __m256 ppoly = _mm256_mul_ps( t, _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps(_mm256_set1_ps(pc5), t, _mm256_set1_ps(pc4)), t, _mm256_set1_ps(pc3)), t, _mm256_set1_ps(pc2)), t, _mm256_set1_ps(pc1)), t, _mm256_set1_ps(pc0))); // Inline exp(-x^2) constexpr float kLog2e = 0x1.715476p+0f; constexpr float kLn2hi = 0x1.62e400p-1f; constexpr float kLn2lo = 0x1.7f7d1cp-20f; __m256 kv = _mm256_floor_ps(_mm256_mul_ps(u, _mm256_set1_ps(kLog2e))); __m256 f = _mm256_fmadd_ps(kv, _mm256_set1_ps(-kLn2hi), u); f = _mm256_fmadd_ps(kv, _mm256_set1_ps(-kLn2lo), f); constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f; constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f; constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f; __m256 exp_neg_f = 
_mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps(_mm256_set1_ps(e5), f, _mm256_set1_ps(e4)), f, _mm256_set1_ps(e3)), f, _mm256_set1_ps(e2)), f, _mm256_set1_ps(e1)), f, _mm256_set1_ps(e0)); __m256i ki = _mm256_cvtps_epi32(kv); __m256 pow2_neg_k = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_sub_epi32(_mm256_set1_epi32(127), ki), 23)); __m256 erfc_result = _mm256_sub_ps( _mm256_set1_ps(1.0f), _mm256_mul_ps(_mm256_mul_ps(pow2_neg_k, exp_neg_f), ppoly)); // Blend __m256 use_erfc = _mm256_cmp_ps(ax, _mm256_set1_ps(0.875f), _CMP_GE_OQ); __m256 result = _mm256_blendv_ps(near_zero, erfc_result, use_erfc); __m256 saturated = _mm256_cmp_ps(ax, _mm256_set1_ps(3.92f), _CMP_GE_OQ); result = _mm256_blendv_ps(result, _mm256_set1_ps(1.0f), saturated); return _mm256_or_ps(result, sign); } // --- AVX S21 (process 8 floats via 4x double2) --- static inline __m256 erf_s21_avx(__m256 x) { __m256 ax_f = dfm::fabs(x); __m256 sign = _mm256_and_ps(x, _mm256_set1_ps(-0.0f)); // Split 8 floats into 4 groups of 2 doubles __m128 lo4 = _mm256_castps256_ps128(ax_f); __m128 hi4 = _mm256_extractf128_ps(ax_f, 1); __m128d ax_0 = _mm_cvtps_pd(lo4); __m128d ax_1 = _mm_cvtps_pd(_mm_movehl_ps(lo4, lo4)); __m128d ax_2 = _mm_cvtps_pd(hi4); __m128d ax_3 = _mm_cvtps_pd(_mm_movehl_ps(hi4, hi4)); auto eval_poly = [](__m128d ax) -> __m128d { __m128d u = _mm_mul_pd(ax, ax); constexpr double c0 = 0x1.20dd74ce6dac1p0; constexpr double c1 = -0x1.812728fedb0c3p-2; constexpr double c2 = 0x1.ce2c679f0f94dp-4; constexpr double c3 = -0x1.b81379b046993p-6; constexpr double c4 = 0x1.55decae500c6cp-8; constexpr double c5 = -0x1.bd402ca3b1d09p-11; constexpr double c6 = 0x1.edfbbbd68d00ep-14; constexpr double c7 = -0x1.d43f94bdfb90fp-17; constexpr double c8 = 0x1.77643daca82f5p-20; constexpr double c9 = -0x1.f276d5cf346ecp-24; constexpr double c10 = 0x1.0a42c17eedcadp-27; constexpr double c11 = -0x1.b999e591ae6bap-32; constexpr double c12 = 0x1.0f73303821975p-36; constexpr double c13 = -0x1.ce969a50741b3p-42; constexpr double c14 = 0x1.e56af7f1b38e4p-48; constexpr double c15 = -0x1.d6f65766c68e5p-55; __m128d p0 = _mm_fmadd_pd(_mm_set1_pd(c1), u, _mm_set1_pd(c0)); __m128d p1 = _mm_fmadd_pd(_mm_set1_pd(c3), u, _mm_set1_pd(c2)); __m128d p2 = _mm_fmadd_pd(_mm_set1_pd(c5), u, _mm_set1_pd(c4)); __m128d p3 = _mm_fmadd_pd(_mm_set1_pd(c7), u, _mm_set1_pd(c6)); __m128d p4 = _mm_fmadd_pd(_mm_set1_pd(c9), u, _mm_set1_pd(c8)); __m128d p5 = _mm_fmadd_pd(_mm_set1_pd(c11), u, _mm_set1_pd(c10)); __m128d p6 = _mm_fmadd_pd(_mm_set1_pd(c13), u, _mm_set1_pd(c12)); __m128d p7 = _mm_fmadd_pd(_mm_set1_pd(c15), u, _mm_set1_pd(c14)); __m128d u2 = _mm_mul_pd(u, u); __m128d q0 = _mm_fmadd_pd(p1, u2, p0); __m128d q1 = _mm_fmadd_pd(p3, u2, p2); __m128d q2 = _mm_fmadd_pd(p5, u2, p4); __m128d q3 = _mm_fmadd_pd(p7, u2, p6); __m128d u4 = _mm_mul_pd(u2, u2); __m128d r0 = _mm_fmadd_pd(q1, u4, q0); __m128d r1 = _mm_fmadd_pd(q3, u4, q2); __m128d u8 = _mm_mul_pd(u4, u4); __m128d R = _mm_fmadd_pd(r1, u8, r0); return _mm_mul_pd(ax, R); }; __m128d res_0 = eval_poly(ax_0); __m128d res_1 = eval_poly(ax_1); __m128d res_2 = eval_poly(ax_2); __m128d res_3 = eval_poly(ax_3); // Convert back: 4x double2 -> 2x float4 -> 1x float8 __m128 lo_f = _mm_movelh_ps(_mm_cvtpd_ps(res_0), _mm_cvtpd_ps(res_1)); __m128 hi_f = _mm_movelh_ps(_mm_cvtpd_ps(res_2), _mm_cvtpd_ps(res_3)); __m256 result = _mm256_set_m128(hi_f, lo_f); // Clamp __m256 saturated = _mm256_cmp_ps(ax_f, _mm256_set1_ps(3.92f), _CMP_GE_OQ); result = _mm256_blendv_ps(result, _mm256_set1_ps(1.0f), saturated); 
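// erf is odd: 'sign' holds only the sign bit of x (x & -0.0f), and the result computed from |x|
// is non-negative, so OR-ing the sign bit back in yields -erf(|x|) for negative inputs.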
return _mm256_or_ps(result, sign); } void BM_erf_s16_avx(benchmark::State& state) { const auto& inputs = erfAvxInputs(); size_t idx = 0; __m256 sum = _mm256_setzero_ps(); for (auto UNUSED_VAR : state) { sum = _mm256_add_ps(sum, erf_s16_avx(inputs[idx])); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations() * 8); consumeSum256(sum); } void BM_erf_s21_avx(benchmark::State& state) { const auto& inputs = erfAvxInputs(); size_t idx = 0; __m256 sum = _mm256_setzero_ps(); for (auto UNUSED_VAR : state) { sum = _mm256_add_ps(sum, erf_s21_avx(inputs[idx])); idx = (idx + 1) & kInputsMask; } state.SetItemsProcessed(state.iterations() * 8); consumeSum256(sum); } BENCHMARK(BM_erf_s16_avx); BENCHMARK(BM_erf_s21_avx); #endif // __AVX2__ #else // !__SSE4_1__ int main() { std::cout << "SSE4.1 not available, skipping benchmarks." << std::endl; return 0; } #endif // __SSE4_1__ ================================================ FILE: benchmarks/fast_math/hwy_benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include "benchmark_helpers.h" #if __has_include("hwy/highway.h") #include "hwy/contrib/math/math-inl.h" namespace dfm = dispenso::fast_math; namespace bench = dispenso::fast_math::bench; namespace hn = hwy::HWY_NAMESPACE; using Flt = dfm::HwyFloat; using HwyFloatTag = dfm::HwyFloatTag; struct BoundsTraits { static constexpr bool kMaxAccuracy = false; static constexpr bool kBoundsValues = true; }; // --- One-arg benchmarks --- void BM_sin_hwy(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_sin_hwy_accurate(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_cos_hwy(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_cos_hwy_accurate(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_tan_hwy(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::tan(x); }); } void BM_atan_hwy(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::atan(x); }); } void BM_acos_hwy(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::acos(x); }); } void BM_asin_hwy(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::asin(x); }); } void BM_exp_hwy(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp_hwy_accurate(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp_hwy_bounds(benchmark::State& state) { bench::runBench( state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp2_hwy(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_exp10_hwy(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_expm1_hwy(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::expm1(x); }); } void BM_log_hwy(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { 
return dfm::log(x); }); } void BM_log_hwy_accurate(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log2_hwy(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_log10_hwy(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_log1p_hwy(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::log1p(hn::Abs(x.v)); }); } void BM_cbrt_hwy(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_cbrt_hwy_accurate(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_frexp_hwy(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { dfm::IntType_t e; return dfm::frexp(x, &e); }); } void BM_ldexp_hwy(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { const hn::RebindToSigned di; return dfm::ldexp(x, hn::Set(di, 3)); }); } void BM_tanh_hwy(benchmark::State& state) { bench::runBench(state, bench::tanhInputs(), [](auto x) { return dfm::tanh(x); }); } void BM_erf_hwy(benchmark::State& state) { bench::runBench(state, bench::erfInputs(), [](auto x) { return dfm::erf(x); }); } // --- Two-arg benchmarks --- void BM_atan2_hwy(benchmark::State& state) { bench::runBench2(state, bench::expInputs(), bench::sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_hypot_hwy(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_hypot_hwy_bounds(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } // --- Highway contrib/math comparison benchmarks --- void BM_sin_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::sinInputs(), [](auto x) { return hn::Sin(HwyFloatTag(), x.v); }); } void BM_cos_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::sinInputs(), [](auto x) { return hn::Cos(HwyFloatTag(), x.v); }); } void BM_exp_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::expInputs(), [](auto x) { return hn::Exp(HwyFloatTag(), x.v); }); } void BM_exp2_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::expInputs(), [](auto x) { return hn::Exp2(HwyFloatTag(), x.v); }); } void BM_log_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return hn::Log(HwyFloatTag(), x.v); }); } void BM_log2_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return hn::Log2(HwyFloatTag(), x.v); }); } void BM_log10_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return hn::Log10(HwyFloatTag(), x.v); }); } void BM_atan_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::sinInputs(), [](auto x) { return hn::Atan(HwyFloatTag(), x.v); }); } void BM_acos_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::acosInputs(), [](auto x) { return hn::Acos(HwyFloatTag(), x.v); }); } void BM_asin_hwy_contrib(benchmark::State& state) { bench::runBench( state, bench::acosInputs(), [](auto x) { return hn::Asin(HwyFloatTag(), x.v); }); } void BM_atan2_hwy_contrib(benchmark::State& state) { 
bench::runBench2(state, bench::expInputs(), bench::sinInputs(), [](auto y, auto x) { return hn::Atan2(HwyFloatTag(), y.v, x.v); }); } // --- Registrations --- BENCHMARK(BM_sin_hwy); BENCHMARK(BM_sin_hwy_accurate); BENCHMARK(BM_sin_hwy_contrib); BENCHMARK(BM_cos_hwy); BENCHMARK(BM_cos_hwy_accurate); BENCHMARK(BM_cos_hwy_contrib); BENCHMARK(BM_tan_hwy); BENCHMARK(BM_atan_hwy); BENCHMARK(BM_atan_hwy_contrib); BENCHMARK(BM_acos_hwy); BENCHMARK(BM_acos_hwy_contrib); BENCHMARK(BM_asin_hwy); BENCHMARK(BM_asin_hwy_contrib); BENCHMARK(BM_exp_hwy); BENCHMARK(BM_exp_hwy_accurate); BENCHMARK(BM_exp_hwy_bounds); BENCHMARK(BM_exp_hwy_contrib); BENCHMARK(BM_exp2_hwy); BENCHMARK(BM_exp2_hwy_contrib); BENCHMARK(BM_exp10_hwy); BENCHMARK(BM_expm1_hwy); BENCHMARK(BM_log_hwy); BENCHMARK(BM_log_hwy_accurate); BENCHMARK(BM_log_hwy_contrib); BENCHMARK(BM_log2_hwy); BENCHMARK(BM_log2_hwy_contrib); BENCHMARK(BM_log10_hwy); BENCHMARK(BM_log10_hwy_contrib); BENCHMARK(BM_log1p_hwy); BENCHMARK(BM_cbrt_hwy); BENCHMARK(BM_cbrt_hwy_accurate); BENCHMARK(BM_frexp_hwy); BENCHMARK(BM_ldexp_hwy); BENCHMARK(BM_tanh_hwy); BENCHMARK(BM_erf_hwy); BENCHMARK(BM_atan2_hwy); BENCHMARK(BM_atan2_hwy_contrib); BENCHMARK(BM_hypot_hwy); BENCHMARK(BM_hypot_hwy_bounds); #else // !__has_include("hwy/highway.h") int main() { std::cout << "Highway not available, skipping benchmarks." << std::endl; return 0; } #endif // __has_include("hwy/highway.h") ================================================ FILE: benchmarks/fast_math/neon_benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include "benchmark_helpers.h" #if defined(__aarch64__) namespace dfm = dispenso::fast_math; namespace bench = dispenso::fast_math::bench; using Flt = float32x4_t; // --- One-arg benchmarks --- void BM_sin_neon(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_cos_neon(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_tan_neon(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::tan(x); }); } void BM_atan_neon(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::atan(x); }); } void BM_acos_neon(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::acos(x); }); } void BM_asin_neon(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::asin(x); }); } void BM_exp_neon(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp2_neon(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_exp10_neon(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_expm1_neon(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::expm1(x); }); } void BM_log_neon(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log2_neon(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_log10_neon(benchmark::State& state) { bench::runBench(state, 
bench::logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_log1p_neon(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](Flt x) { return dfm::log1p(vabsq_f32(x)); }); } void BM_cbrt_neon(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_frexp_neon(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { dfm::IntType_t e; return dfm::frexp(x, &e); }); } void BM_ldexp_neon(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return dfm::ldexp(x, vdupq_n_s32(3)); }); } void BM_tanh_neon(benchmark::State& state) { bench::runBench(state, bench::tanhInputs(), [](auto x) { return dfm::tanh(x); }); } void BM_erf_neon(benchmark::State& state) { bench::runBench(state, bench::erfInputs(), [](auto x) { return dfm::erf(x); }); } // --- Two-arg benchmarks --- void BM_atan2_neon(benchmark::State& state) { bench::runBench2(state, bench::expInputs(), bench::sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_hypot_neon(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_hypot_neon_bounds(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_pow_neon(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_neon_accurate(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_neon_scalar_exp(benchmark::State& state) { bench::runBench(state, bench::powBaseInputs(), [](auto x) { return dfm::pow(x, 2.5f); }); } // --- Libc-packed comparison (NEON-specific, kept hand-written) --- void BM_pow_libc_neon(benchmark::State& state) { const auto& bases = bench::powBaseInputs(); const auto& exps = bench::powExpInputs(); size_t idx = 0; Flt sum = vdupq_n_f32(0.0f); for (auto _ : state) { (void)_; alignas(16) float x[4], y[4], r[4]; vst1q_f32(x, bases[idx]); vst1q_f32(y, exps[idx]); for (int32_t i = 0; i < 4; ++i) { r[i] = ::powf(x[i], y[i]); } sum = vaddq_f32(sum, vld1q_f32(r)); idx = (idx + 1) & bench::kInputsMask; } state.SetItemsProcessed(state.iterations() * 4); bench::consumeResult(sum); } // --- Registrations --- BENCHMARK(BM_sin_neon); BENCHMARK(BM_cos_neon); BENCHMARK(BM_tan_neon); BENCHMARK(BM_atan_neon); BENCHMARK(BM_acos_neon); BENCHMARK(BM_asin_neon); BENCHMARK(BM_exp_neon); BENCHMARK(BM_exp2_neon); BENCHMARK(BM_exp10_neon); BENCHMARK(BM_expm1_neon); BENCHMARK(BM_log_neon); BENCHMARK(BM_log2_neon); BENCHMARK(BM_log10_neon); BENCHMARK(BM_log1p_neon); BENCHMARK(BM_cbrt_neon); BENCHMARK(BM_frexp_neon); BENCHMARK(BM_ldexp_neon); BENCHMARK(BM_tanh_neon); BENCHMARK(BM_erf_neon); BENCHMARK(BM_atan2_neon); BENCHMARK(BM_hypot_neon); BENCHMARK(BM_hypot_neon_bounds); BENCHMARK(BM_pow_neon); BENCHMARK(BM_pow_neon_accurate); BENCHMARK(BM_pow_neon_scalar_exp); BENCHMARK(BM_pow_libc_neon); #else // !defined(__aarch64__) int main() { std::cout << "AArch64 NEON not available, skipping benchmarks." << std::endl; return 0; } #endif // defined(__aarch64__) ================================================ FILE: benchmarks/fast_math/sse_benchmarks.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include "benchmark_helpers.h" #if defined(__SSE4_1__) namespace dfm = dispenso::fast_math; namespace bench = dispenso::fast_math::bench; using Flt = __m128; struct BoundsTraits { static constexpr bool kMaxAccuracy = false; static constexpr bool kBoundsValues = true; }; // --- One-arg benchmarks --- void BM_sin_sse(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_sin_sse_accurate(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::sin(x); }); } void BM_cos_sse(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_cos_sse_accurate(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::cos(x); }); } void BM_tan_sse(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::tan(x); }); } void BM_atan_sse(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::atan(x); }); } void BM_acos_sse(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::acos(x); }); } void BM_asin_sse(benchmark::State& state) { bench::runBench(state, bench::acosInputs(), [](auto x) { return dfm::asin(x); }); } void BM_exp_sse(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp_sse_accurate(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp_sse_bounds(benchmark::State& state) { bench::runBench( state, bench::expInputs(), [](auto x) { return dfm::exp(x); }); } void BM_exp2_sse(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp2(x); }); } void BM_exp10_sse(benchmark::State& state) { bench::runBench(state, bench::expInputs(), [](auto x) { return dfm::exp10(x); }); } void BM_expm1_sse(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](auto x) { return dfm::expm1(x); }); } void BM_log_sse(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log_sse_accurate(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log(x); }); } void BM_log2_sse(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log2(x); }); } void BM_log10_sse(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::log10(x); }); } void BM_log1p_sse(benchmark::State& state) { bench::runBench(state, bench::sinInputs(), [](Flt x) { Flt ax = _mm_andnot_ps(_mm_set1_ps(-0.0f), x); return dfm::log1p(ax); }); } void BM_cbrt_sse(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_cbrt_sse_accurate(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { return dfm::cbrt(x); }); } void BM_frexp_sse(benchmark::State& state) { bench::runBench(state, bench::logInputs(), [](auto x) { dfm::IntType_t e; return dfm::frexp(x, &e); }); } void BM_ldexp_sse(benchmark::State& state) { bench::runBench( state, bench::logInputs(), [](auto x) { return dfm::ldexp(x, _mm_set1_epi32(3)); }); } void BM_tanh_sse(benchmark::State& state) { bench::runBench(state, 
bench::tanhInputs(), [](auto x) { return dfm::tanh(x); }); } void BM_erf_sse(benchmark::State& state) { bench::runBench(state, bench::erfInputs(), [](auto x) { return dfm::erf(x); }); } // --- Two-arg benchmarks --- void BM_atan2_sse(benchmark::State& state) { bench::runBench2(state, bench::expInputs(), bench::sinInputs(), [](auto y, auto x) { return dfm::atan2(y, x); }); } void BM_hypot_sse(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_hypot_sse_bounds(benchmark::State& state) { bench::runBench2(state, bench::hypotInputs(), bench::sinInputs(), [](auto x, auto y) { return dfm::hypot(x, y); }); } void BM_pow_sse(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_sse_accurate(benchmark::State& state) { bench::runBench2( state, bench::powBaseInputs(), bench::powExpInputs(), [](auto b, auto e) { return dfm::pow(b, e); }); } void BM_pow_sse_scalar_exp(benchmark::State& state) { bench::runBench(state, bench::powBaseInputs(), [](auto x) { return dfm::pow(x, 2.5f); }); } // --- Libc-packed comparisons (SSE-specific, kept hand-written) --- void BM_hypot_libc(benchmark::State& state) { const auto& inputs = bench::hypotInputs(); const auto& inputs2 = bench::sinInputs(); size_t idx = 0; Flt sum = _mm_setzero_ps(); for (auto _ : state) { (void)_; alignas(16) float x[4], y[4], r[4]; _mm_store_ps(x, inputs[idx]); _mm_store_ps(y, inputs2[idx]); r[0] = ::hypotf(x[0], y[0]); r[1] = ::hypotf(x[1], y[1]); r[2] = ::hypotf(x[2], y[2]); r[3] = ::hypotf(x[3], y[3]); sum = _mm_add_ps(sum, _mm_load_ps(r)); idx = (idx + 1) & bench::kInputsMask; } state.SetItemsProcessed(state.iterations() * 4); bench::consumeResult(sum); } void BM_pow_libc_sse(benchmark::State& state) { const auto& bases = bench::powBaseInputs(); const auto& exps = bench::powExpInputs(); size_t idx = 0; Flt sum = _mm_setzero_ps(); for (auto _ : state) { (void)_; alignas(16) float x[4], y[4], r[4]; _mm_store_ps(x, bases[idx]); _mm_store_ps(y, exps[idx]); r[0] = ::powf(x[0], y[0]); r[1] = ::powf(x[1], y[1]); r[2] = ::powf(x[2], y[2]); r[3] = ::powf(x[3], y[3]); sum = _mm_add_ps(sum, _mm_load_ps(r)); idx = (idx + 1) & bench::kInputsMask; } state.SetItemsProcessed(state.iterations() * 4); bench::consumeResult(sum); } // --- Registrations --- BENCHMARK(BM_sin_sse); BENCHMARK(BM_sin_sse_accurate); BENCHMARK(BM_cos_sse); BENCHMARK(BM_cos_sse_accurate); BENCHMARK(BM_tan_sse); BENCHMARK(BM_atan_sse); BENCHMARK(BM_acos_sse); BENCHMARK(BM_asin_sse); BENCHMARK(BM_exp_sse); BENCHMARK(BM_exp_sse_accurate); BENCHMARK(BM_exp_sse_bounds); BENCHMARK(BM_exp2_sse); BENCHMARK(BM_exp10_sse); BENCHMARK(BM_expm1_sse); BENCHMARK(BM_log_sse); BENCHMARK(BM_log_sse_accurate); BENCHMARK(BM_log2_sse); BENCHMARK(BM_log10_sse); BENCHMARK(BM_log1p_sse); BENCHMARK(BM_cbrt_sse); BENCHMARK(BM_cbrt_sse_accurate); BENCHMARK(BM_frexp_sse); BENCHMARK(BM_ldexp_sse); BENCHMARK(BM_tanh_sse); BENCHMARK(BM_erf_sse); BENCHMARK(BM_atan2_sse); BENCHMARK(BM_hypot_sse); BENCHMARK(BM_hypot_sse_bounds); BENCHMARK(BM_hypot_libc); BENCHMARK(BM_pow_sse); BENCHMARK(BM_pow_sse_accurate); BENCHMARK(BM_pow_sse_scalar_exp); BENCHMARK(BM_pow_libc_sse); #else // !defined(__SSE4_1__) int main() { std::cout << "SSE4.1 not available, skipping benchmarks." 
<< std::endl; return 0; } #endif // defined(__SSE4_1__) ================================================ FILE: benchmarks/for_each_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * Benchmarks for dispenso::for_each_n / dispenso::for_each. * Tests scheduling overhead across different container/iterator types: * - vector (random-access) * - deque (random-access) * - list (bidirectional) * - set (bidirectional, const elements) */ #include #include #include #include #include #include "thread_benchmark_common.h" static uint32_t kSeed(8); static constexpr int kSmallSize = 1000; static constexpr int kMediumSize = 1000000; static constexpr int kLargeSize = 100000000; const std::vector& getInputs(int num_elements) { static std::unordered_map> vecs; auto it = vecs.find(num_elements); if (it != vecs.end()) { return it->second; } srand(kSeed); std::vector values; values.reserve(num_elements); for (int i = 0; i < num_elements; ++i) { values.push_back((rand() & 255) - 127); } auto res = vecs.emplace(num_elements, std::move(values)); assert(res.second); return res.first->second; } void checkResults(const std::vector& input, const std::vector& output) { for (size_t i = 0; i < input.size(); ++i) { if (output[i] != input[i] * input[i] - 3 * input[i]) { std::cerr << "FAIL! " << output[i] << " vs " << input[i] * input[i] - 3 * input[i] << std::endl; abort(); } } } template void BM_serial(benchmark::State& state) { std::vector output(num_elements, 0); auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { for (size_t i = 0; i < num_elements; ++i) { output[i] = input[i] * input[i] - 3 * input[i]; } } } void BM_for_each_n(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); std::vector output(num_elements, 0); dispenso::ThreadPool pool(num_threads); auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); dispenso::for_each_n( tasks, input.begin(), static_cast(num_elements), [&output, &input](const int& val) { size_t idx = static_cast(&val - input.data()); output[idx] = val * val - 3 * val; }); } checkResults(input, output); } void BM_for_each_n_deque(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); auto& input = getInputs(num_elements); std::deque deq(input.begin(), input.end()); dispenso::ThreadPool pool(num_threads); std::atomic sum(0); for (auto UNUSED_VAR : state) { sum.store(0, std::memory_order_relaxed); dispenso::TaskSet tasks(pool); dispenso::for_each_n( tasks, deq.begin(), static_cast(num_elements), [&sum](const int& val) { sum.fetch_add(val * val - 3 * val, std::memory_order_relaxed); }); } benchmark::DoNotOptimize(sum.load()); } void BM_for_each_n_list(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); auto& input = getInputs(num_elements); std::list lst(input.begin(), input.end()); dispenso::ThreadPool pool(num_threads); std::atomic sum(0); for (auto UNUSED_VAR : state) { sum.store(0, std::memory_order_relaxed); dispenso::TaskSet tasks(pool); dispenso::for_each_n( tasks, lst.begin(), static_cast(num_elements), [&sum](const int& val) { sum.fetch_add(val * val - 3 * val, std::memory_order_relaxed); }); } benchmark::DoNotOptimize(sum.load()); } void 
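// A minimal usage sketch for the pattern benchmarked above (hedged: the pool size,
// container, and reduction below are illustrative assumptions, not part of this file).
// dispenso::for_each_n takes a TaskSet, a begin iterator, an element count, and a
// callable; the TaskSet waits for any outstanding work when it is destroyed:
//   dispenso::ThreadPool pool(8);
//   std::vector<int> data(1000, 1);
//   std::atomic<int64_t> sum(0);
//   dispenso::TaskSet tasks(pool);
//   dispenso::for_each_n(tasks, data.begin(), data.size(), [&sum](const int& v) {
//     sum.fetch_add(v, std::memory_order_relaxed);
//   });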
BM_for_each_n_set(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); auto& input = getInputs(num_elements); std::set s(input.begin(), input.end()); // set deduplicates, so actual size may be smaller size_t actual_size = s.size(); dispenso::ThreadPool pool(num_threads); std::atomic sum(0); for (auto UNUSED_VAR : state) { sum.store(0, std::memory_order_relaxed); dispenso::TaskSet tasks(pool); dispenso::for_each_n(tasks, s.begin(), actual_size, [&sum](const int& val) { sum.fetch_add(val * val - 3 * val, std::memory_order_relaxed); }); } benchmark::DoNotOptimize(sum.load()); } static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } // Smaller argument set for containers where 100M elements is impractical static void SmallArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } BENCHMARK_TEMPLATE(BM_serial, kSmallSize); BENCHMARK_TEMPLATE(BM_serial, kMediumSize); BENCHMARK_TEMPLATE(BM_serial, kLargeSize); BENCHMARK(BM_for_each_n)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_for_each_n_deque)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_for_each_n_list)->Apply(SmallArguments)->UseRealTime(); BENCHMARK(BM_for_each_n_set)->Apply(SmallArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/for_latency_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #if defined(_OPENMP) #include #endif #include #include #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_for.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" namespace { using namespace std::chrono_literals; uint32_t kSeed(8); constexpr int kSize = 50000; constexpr auto kSleep = 30ms; } // namespace // Adapted from Google gtest examples // Returns true iff n is a prime number. bool isPrime(int n) { // Trivial case 1: small numbers if (n <= 1) return false; // Trivial case 2: even numbers if (n % 2 == 0) return n == 2; // Now, we have that n is odd and n >= 3. // Try to divide n by every odd number i, starting from 3 for (int i = 3;; i += 2) { // We only have to try i up to the squre root of n if (i > n / i) break; // Now, we have i <= n/i < n. // If n is divisible by i, n is not prime. if (n % i == 0) return false; } // n has no integer factor in the range (1, n), and thus is prime. 
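// (Bound note: the loop condition uses i > n / i rather than i * i > n; both express
//  the same "i exceeds sqrt(n)" cutoff, but the division form cannot overflow.)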
return true; } const std::vector& getInputs(int numElements) { static std::unordered_map> vecs; auto it = vecs.find(numElements); if (it != vecs.end()) { return it->second; } std::mt19937_64 gen64(kSeed); std::uniform_int_distribution<> distribution(100000, 1000000); std::vector values; values.reserve(numElements); for (int i = 0; i < numElements; ++i) { values.push_back(distribution(gen64)); } auto res = vecs.emplace(numElements, std::move(values)); assert(res.second); return res.first->second; } void BM_serial(benchmark::State& state) { std::vector output(kSize, 0); auto& input = getInputs(kSize); std::vector times; times.reserve(1000); for (auto UNUSED_VAR : state) { std::this_thread::sleep_for(kSleep); times.push_back(dispenso::getTime()); for (size_t i = 0; i < kSize; ++i) { output[i] = isPrime(input[i]); } times.back() = dispenso::getTime() - times.back(); } doStats(times, state); } void BM_dispenso(benchmark::State& state) { const int numThreads = state.range(0) - 1; std::vector output(kSize, 0); dispenso::resizeGlobalThreadPool(numThreads); std::vector times; times.reserve(1000); auto& input = getInputs(kSize); for (auto UNUSED_VAR : state) { std::this_thread::sleep_for(kSleep); times.push_back(dispenso::getTime()); dispenso::parallel_for( dispenso::makeChunkedRange(0, kSize), [&input, &output](size_t i, size_t e) { for (; i != e; ++i) { output[i] = isPrime(input[i]); } }); times.back() = dispenso::getTime() - times.back(); } doStats(times, state); } #if defined(_OPENMP) void BM_omp(benchmark::State& state) { const int numThreads = state.range(0); std::vector output(kSize, 0); omp_set_num_threads(numThreads); std::vector times; times.reserve(1000); auto& input = getInputs(kSize); for (auto UNUSED_VAR : state) { std::this_thread::sleep_for(kSleep); times.push_back(dispenso::getTime()); #pragma omp parallel for for (int i = 0; i < kSize; ++i) { output[i] = isPrime(input[i]); } times.back() = dispenso::getTime() - times.back(); } doStats(times, state); } #endif /*defined(_OPENMP)*/ #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int numThreads = state.range(0); std::vector output(kSize, 0); tbb_compat::task_scheduler_init initsched(numThreads); std::vector times; times.reserve(1000); auto& input = getInputs(kSize); for (auto UNUSED_VAR : state) { std::this_thread::sleep_for(kSleep); times.push_back(dispenso::getTime()); tbb::parallel_for( tbb::blocked_range(0, kSize), [&input, &output](const tbb::blocked_range& r) { for (size_t i = r.begin(); i < r.end(); ++i) { output[i] = isPrime(input[i]); } }); times.back() = dispenso::getTime() - times.back(); } doStats(times, state); } #endif // !BENCHMARK_WITHOUT_TBB static void CustomArguments(benchmark::internal::Benchmark* b) { for (int i : pow2HalfStepThreads()) { b->Arg(i); } } BENCHMARK(BM_serial)->UseRealTime(); #if defined(_OPENMP) BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif // OPENMP #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/future_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #if !defined(BENCHMARK_WITHOUT_FOLLY) #include #include #endif // !BENCHMARK_WITHOUT_FOLLY #include "thread_benchmark_common.h" constexpr size_t kSmallSize = 13; constexpr size_t kMediumSize = 16; constexpr size_t kLargeSize = 19; // Note that there are many optimizations that could be made for these tree build routines. The // goal was to make these as apples-to-apples as possible. struct Node { Node* left; Node* right; uint32_t value; void setValue(uint32_t unique_bitset, uint32_t modulo) { value = 0; for (uint32_t i = 0; i < 32; ++i) { value += unique_bitset % modulo; unique_bitset /= modulo; } } }; class Allocator { public: void reset(size_t depth) { nodes_.resize(std::pow(2, depth) - 1); next_.store(0, std::memory_order_release); } Node* alloc() { size_t cur = next_.fetch_add(1, std::memory_order_relaxed); return &nodes_[cur]; } private: std::vector nodes_; std::atomic next_{0}; }; const std::vector& getModulos() { static const std::vector modulos = []() { std::mt19937 mt; std::uniform_int_distribution<> dis(2, 55); std::vector m; for (size_t i = 0; i < 64; ++i) { m.emplace_back(dis(mt)); } return m; }(); return modulos; } uint64_t sumTree(Node* root) { if (!root) { return 0; } return root->value + sumTree(root->left) + sumTree(root->right); } void checkTree(Node* root, uint32_t depth, uint32_t modulo) { uint64_t expectedSum = 0; uint32_t num = std::pow(2, depth); for (uint32_t i = 0; i < num; ++i) { auto bitset = i; while (bitset) { expectedSum += bitset % modulo; bitset /= modulo; } } uint64_t actual = sumTree(root); if (actual != expectedSum) { std::cerr << "Mismatch! " << expectedSum << " vs " << actual << std::endl; std::abort(); } } Node* serialTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { --depth; Node* node = allocator.alloc(); node->setValue(bitset, modulo); if (!depth) { node->left = nullptr; node->right = nullptr; return node; } node->left = serialTree(allocator, depth, (bitset << 1), modulo); node->right = serialTree(allocator, depth, (bitset << 1) | 1, modulo); return node; } template void BM_serial_tree(benchmark::State& state) { Allocator alloc; alloc.reset(depth); getModulos(); uint32_t modulo; Node* root; size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; root = serialTree(alloc, depth, 1, modulo); m = (m + 1 == getModulos().size()) ? 0 : m + 1; } checkTree(root, depth, modulo); } Node* stdTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { --depth; Node* node = allocator.alloc(); node->setValue(bitset, modulo); if (!depth) { node->left = nullptr; node->right = nullptr; return node; } auto left = std::async([&]() { return stdTree(allocator, depth, (bitset << 1), modulo); }); auto right = std::async([&]() { return stdTree(allocator, depth, (bitset << 1) | 1, modulo); }); node->left = left.get(); node->right = right.get(); return node; } template void BM_std_tree(benchmark::State& state) { #if defined(__APPLE__) || defined(__ANDROID__) // std::async on macOS and Android (libc++) creates a real thread per call. // At depth 16+ this spawns tens of thousands of threads recursively, // exhausting OS thread limits and hanging the process. if (depth >= kMediumSize) { state.SkipWithError("std::async hangs on this platform at this tree depth"); return; } #elif defined(_WIN32) // std::async on Windows uses a threadpool. 
The recursive tree construction // pattern causes deadlock as threadpool threads wait for tasks that require // more threadpool threads. Skip all depths on Windows. state.SkipWithError("std::async deadlocks on Windows threadpool"); return; #endif Allocator alloc; alloc.reset(depth); getModulos(); uint32_t modulo; Node* root; size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; root = stdTree(alloc, depth, 1, modulo); m = (m + 1 == getModulos().size()) ? 0 : m + 1; } checkTree(root, depth, modulo); } Node* dispensoTree(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { --depth; Node* node = allocator.alloc(); node->setValue(bitset, modulo); if (!depth) { node->left = nullptr; node->right = nullptr; return node; } auto left = dispenso::async( [=, &allocator]() { return dispensoTree(allocator, depth, (bitset << 1), modulo); }); auto right = dispenso::async( [=, &allocator]() { return dispensoTree(allocator, depth, (bitset << 1) | 1, modulo); }); node->left = left.get(); node->right = right.get(); return node; } template void BM_dispenso_tree(benchmark::State& state) { Allocator alloc; alloc.reset(depth); getModulos(); dispenso::globalThreadPool(); uint32_t modulo; Node* root; size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; root = dispensoTree(alloc, depth, 1, modulo); m = (m + 1 == getModulos().size()) ? 0 : m + 1; } checkTree(root, depth, modulo); } #if !defined(BENCHMARK_WITHOUT_FOLLY) folly::SemiFuture follyTree( folly::Executor* exec, Node* node, Allocator* allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { --depth; node->setValue(bitset, modulo); if (!depth) { node->left = nullptr; node->right = nullptr; return folly::Unit{}; } node->left = allocator->alloc(); node->right = allocator->alloc(); return folly::via( exec, [=]() { return folly::collectAll( follyTree(exec, node->left, allocator, depth, bitset << 1, modulo), follyTree(exec, node->right, allocator, depth, bitset << 1 | 1, modulo)) .unit(); }) .semi(); } template void BM_folly_tree(benchmark::State& state) { folly::CPUThreadPoolExecutor follyExec{std::thread::hardware_concurrency()}; Allocator alloc; alloc.reset(depth); uint32_t modulo; Node root; size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; follyTree(&follyExec, &root, &alloc, depth, 1, modulo).via(&follyExec).get(); m = (m + 1 == getModulos().size()) ? 
0 : m + 1; } checkTree(&root, depth, modulo); } #endif // !BENCHMARK_WITHOUT_FOLLY void dispensoTaskSetTree( dispenso::ConcurrentTaskSet& tasks, Node* node, Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { node->setValue(bitset, modulo); --depth; if (!depth) { node->left = nullptr; node->right = nullptr; return; } tasks.schedule( [&tasks, &allocator, node, depth, bitset, modulo]() { node->left = allocator.alloc(); dispensoTaskSetTree(tasks, node->left, allocator, depth, (bitset << 1), modulo); }, true); tasks.schedule( [&tasks, &allocator, node, depth, bitset, modulo]() { node->right = allocator.alloc(); dispensoTaskSetTree(tasks, node->right, allocator, depth, (bitset << 1) | 1, modulo); }, true); } template void BM_dispenso_taskset_tree(benchmark::State& state) { Allocator alloc; alloc.reset(depth); getModulos(); uint32_t modulo; Node root; dispenso::ConcurrentTaskSet tasks( dispenso::globalThreadPool(), dispenso::ParentCascadeCancel::kOff, 2); size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; dispensoTaskSetTree(tasks, &root, alloc, depth, 1, modulo); tasks.wait(); m = (m + 1 == getModulos().size()) ? 0 : m + 1; } checkTree(&root, depth, modulo); } void dispensoTaskSetTreeBulk( dispenso::ConcurrentTaskSet& tasks, Node* node, Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { node->setValue(bitset, modulo); --depth; if (!depth) { node->left = nullptr; node->right = nullptr; return; } std::array children = {&node->left, &node->right}; tasks.scheduleBulk(2, [&tasks, &allocator, children, depth, bitset, modulo](size_t i) { return [&tasks, &allocator, child = children[i], depth, bitset = (bitset << 1) | static_cast(i), modulo]() { *child = allocator.alloc(); dispensoTaskSetTreeBulk(tasks, *child, allocator, depth, bitset, modulo); }; }); } template void BM_dispenso_taskset_tree_bulk(benchmark::State& state) { Allocator alloc; alloc.reset(depth); getModulos(); uint32_t modulo; Node root; dispenso::ConcurrentTaskSet tasks( dispenso::globalThreadPool(), dispenso::ParentCascadeCancel::kOff, 2); size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; dispensoTaskSetTreeBulk(tasks, &root, alloc, depth, 1, modulo); tasks.wait(); m = (m + 1 == getModulos().size()) ? 0 : m + 1; } checkTree(&root, depth, modulo); } dispenso::Future dispensoTreeWhenAll(Allocator& allocator, uint32_t depth, uint32_t bitset, uint32_t modulo) { --depth; Node* node = allocator.alloc(); node->setValue(bitset, modulo); if (!depth) { node->left = nullptr; node->right = nullptr; return dispenso::make_ready_future(node); } auto left = dispenso::async([depth, bitset, modulo, &allocator]() { return dispensoTreeWhenAll(allocator, depth, (bitset << 1), modulo); }); auto right = dispenso::async([depth, bitset, modulo, &allocator]() { return dispensoTreeWhenAll(allocator, depth, (bitset << 1) | 1, modulo); }); return dispenso::when_all(left, right).then([node](auto&& both) { auto& tuple = both.get(); node->left = std::get<0>(tuple).get().get(); node->right = std::get<1>(tuple).get().get(); return node; }); } template void BM_dispenso_tree_when_all(benchmark::State& state) { Allocator alloc; alloc.reset(depth); getModulos(); dispenso::globalThreadPool(); uint32_t modulo; Node* root; size_t m = 0; for (auto UNUSED_VAR : state) { alloc.reset(depth); modulo = getModulos()[m]; root = dispensoTreeWhenAll(alloc, depth, 1, modulo).get(); m = (m + 1 == getModulos().size()) ? 
0 : m + 1; } checkTree(root, depth, modulo); } BENCHMARK_TEMPLATE(BM_serial_tree, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_serial_tree, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_serial_tree, kLargeSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_std_tree, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_std_tree, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_std_tree, kLargeSize)->UseRealTime(); #if !defined(BENCHMARK_WITHOUT_FOLLY) BENCHMARK_TEMPLATE(BM_folly_tree, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_folly_tree, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_folly_tree, kLargeSize)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_FOLLY BENCHMARK_TEMPLATE(BM_dispenso_tree, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_tree, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_tree, kLargeSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_taskset_tree, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_taskset_tree, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_taskset_tree, kLargeSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_taskset_tree_bulk, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_taskset_tree_bulk, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_taskset_tree_bulk, kLargeSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_tree_when_all, kSmallSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_tree_when_all, kMediumSize)->UseRealTime(); BENCHMARK_TEMPLATE(BM_dispenso_tree_when_all, kLargeSize)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/graph_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include //-------------------------------------------------------------------------------- // Taskflow //-------------------------------------------------------------------------------- class TaskFlowBigTree { public: static size_t sizeOfLevel(size_t level) { return 1ul << (numBits_ - shiftStep_ * level); } void buildTree() { for (size_t level = 1; level < numLevels_; ++level) { buildLevel(level); } } void allocateMemory() { // the goal of this task is to calculate the sum of the numbers in this array for (size_t level = 1; level < numLevels_; ++level) { const size_t s = sizeOfLevel(level); data_[level].resize(s, 0lu); } // zero level data is input data for calculation data_[0].resize(1ul << numBits_); std::iota(data_[0].begin(), data_[0].end(), 0); } void initData() { for (size_t level = 1; level < numLevels_; ++level) { std::fill(data_[level].begin(), data_[level].end(), 0.f); } std::iota(data_[0].begin(), data_[0].end(), 0); } void buildLevel(size_t level) { std::vector& tasks = levelTasks_[level]; const size_t numNodes = 1ul << (numBits_ - shiftStep_ * level); for (size_t n = 0; n < numNodes; ++n) { tasks.push_back(taskflow_.emplace([this, level, n]() { for (size_t j = 0; j < numPredecessors_; ++j) { const size_t index = (n << shiftStep_) | j; data_[level][n] += data_[level - 1][index]; } })); if (level > 1) { for (size_t j = 0; j < numPredecessors_; ++j) { const size_t index = (n << shiftStep_) | j; levelTasks_[level][n].succeed(levelTasks_[level - 1][index]); } } } } bool testTree() { const size_t result = std::accumulate(this->data_[0].begin(), this->data_[0].end(), size_t(0)); executor_.run(taskflow_).wait(); const bool isTestPass = data_[numLevels_ - 1][0] == result; initData(); return isTestPass; } static constexpr size_t shiftStep_ = 4; // 4 static constexpr size_t numBits_ = 5 * shiftStep_; // 5 // should be divisible by shiftStep static constexpr size_t numPredecessors_ = 1 << shiftStep_; static constexpr size_t startShnft_ = shiftStep_; static constexpr size_t numLevels_ = numBits_ / shiftStep_ + 1; static constexpr size_t level0Size_ = 1ul << numBits_; std::array, numLevels_> data_; std::array, numLevels_> levelTasks_; tf::Taskflow taskflow_; tf::Executor executor_; }; //-------------------------------------------------------------------------------- // dispenso::Graph //-------------------------------------------------------------------------------- template class BigTree { public: static size_t sizeOfLevel(size_t level) { return 1ul << (numBits_ - shiftStep_ * level); } void buildTree() { for (size_t level = 1; level < numLevels_; ++level) { subgraphs_[level] = &g_.addSubgraph(); buildLevel(level); } } void allocateMemory() { // the goal of this task is to calculate the sum of the numbers in this array for (size_t level = 1; level < numLevels_; ++level) { const size_t s = sizeOfLevel(level); data_[level].resize(s, 0lu); } // zero level data is input data for calculation data_[0].resize(1ul << numBits_); std::iota(data_[0].begin(), data_[0].end(), 0); } void initData() { for (size_t level = 1; level < numLevels_; ++level) { std::fill(data_[level].begin(), data_[level].end(), 0.f); } std::iota(data_[0].begin(), data_[0].end(), 0); } void buildLevel(size_t level) { SubGraphType* subgraph = subgraphs_[level]; const size_t numNodes = 1ul << (numBits_ - shiftStep_ * level); subgraph->reserve(numNodes); for (size_t n = 0; n < numNodes; ++n) { subgraph->addNode([this, level, n]() { for (size_t j = 0; j < numPredecessors_; 
++j) { const size_t index = (n << shiftStep_) | j; data_[level][n] += data_[level - 1][index]; } }); if (level > 1) { for (size_t j = 0; j < numPredecessors_; ++j) { const size_t index = (n << shiftStep_) | j; subgraphs_[level]->node(n).dependsOn(subgraphs_[level - 1]->node(index)); } } } } bool testTree() { const size_t result = std::accumulate(this->data_[0].begin(), this->data_[0].end(), size_t(0)); dispenso::ThreadPool& threadPool = dispenso::globalThreadPool(); dispenso::ConcurrentTaskSet concurrentTaskSet(threadPool); setAllNodesIncomplete(g_); executor_(concurrentTaskSet, g_); concurrentTaskSet.wait(); const bool isTestPass = data_[numLevels_ - 1][0] == result; initData(); return isTestPass; } static constexpr size_t shiftStep_ = 4; // 4 static constexpr size_t numBits_ = 5 * shiftStep_; // 5 // should be divisible by shiftStep static constexpr size_t numPredecessors_ = 1 << shiftStep_; static constexpr size_t startShnft_ = shiftStep_; static constexpr size_t numLevels_ = numBits_ / shiftStep_ + 1; static constexpr size_t level0Size_ = 1ul << numBits_; using SubGraphType = typename G::SubgraphType; std::array, numLevels_> data_; std::array subgraphs_; G g_; dispenso::ConcurrentTaskSetExecutor executor_; }; static void BM_taskflow_build_big_tree(benchmark::State& state) { TaskFlowBigTree bigTree; bigTree.allocateMemory(); for (auto _ : state) { bigTree.buildTree(); assert(bigTree.testTree()); bigTree.taskflow_.clear(); for (uint32_t i = 0; i < TaskFlowBigTree::numLevels_; ++i) { bigTree.levelTasks_[i].clear(); } } } template static void BM_build_big_tree(benchmark::State& state) { BigTree bigTree; bigTree.allocateMemory(); for (auto _ : state) { bigTree.buildTree(); assert(bigTree.testTree()); bigTree.g_.clear(); } } static void BM_build_bi_prop_dependency_chain(benchmark::State& state) { size_t counter; for (auto _ : state) { dispenso::BiPropGraph graph; dispenso::BiPropNode* prevNode = &graph.addNode([&counter]() { counter++; }); for (size_t i = 1; i < 1024; ++i) { dispenso::BiPropNode* node = &graph.addNode([&counter]() { counter++; }); prevNode->biPropDependsOn(*node); prevNode = node; } } } template static void BM_build_dependency_chain(benchmark::State& state) { size_t counter; for (auto _ : state) { G graph; typename G::NodeType* prevNode = &graph.addNode([&counter]() { counter++; }); for (size_t i = 1; i < 1024; ++i) { typename G::NodeType* node = &graph.addNode([&counter]() { counter++; }); prevNode->dependsOn(*node); prevNode = node; } } } template static void BM_execute_dependency_chain(benchmark::State& state) { size_t counter; G graph; typename G::NodeType* prevNode = &graph.addNode([&counter]() { counter++; }); for (size_t i = 1; i < 1024; ++i) { typename G::NodeType* node = &graph.addNode([&counter]() { counter++; }); prevNode->dependsOn(*node); prevNode = node; } dispenso::SingleThreadExecutor singleThreadExecutor; for (auto _ : state) { setAllNodesIncomplete(graph); counter = 0; singleThreadExecutor(graph); } } static void BM_build_bi_prop_dependency_group(benchmark::State& state) { size_t counter; constexpr size_t numNodes = 4; std::array nodes1, nodes2; for (auto _ : state) { dispenso::BiPropGraph graph; for (size_t i = 0; i < numNodes; ++i) { nodes1[i] = &graph.addNode([&counter]() { counter++; }); nodes2[i] = &graph.addNode([&counter]() { counter++; }); nodes1[i]->biPropDependsOn(*nodes2[i]); } for (size_t i = 1; i < numNodes; ++i) { nodes1[i - 1]->biPropDependsOn(*nodes1[i]); } } } template static void BM_forward_propagator_node(benchmark::State& state) { G 
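// Sketch of what follows: build a random DAG of 32768 nodes in which each node depends
// on one earlier node, so every node transitively depends on nodes[0]. Each timed
// iteration marks all nodes completed, flips only the root to incomplete, and measures
// how quickly ForwardPropagator re-marks the root's transitive dependents.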
graph; constexpr size_t numNodes = 32768; std::array nodes; size_t counter; std::mt19937 rng(12345); nodes[0] = &graph.addNode([&counter]() { counter++; }); // root for (size_t i = 1; i < numNodes; ++i) { nodes[i] = &graph.addNode([&counter]() { counter++; }); std::uniform_int_distribution<> parentDistr(0, i - 1); nodes[i]->dependsOn(*nodes[parentDistr(rng)]); } dispenso::ForwardPropagator forwardPropagator; for (auto _ : state) { state.PauseTiming(); graph.forEachNode([](const dispenso::Node& node) { node.setCompleted(); }); nodes[0]->setIncomplete(); state.ResumeTiming(); forwardPropagator(graph); } } BENCHMARK(BM_taskflow_build_big_tree); BENCHMARK_TEMPLATE(BM_build_big_tree, dispenso::Graph); BENCHMARK_TEMPLATE(BM_build_big_tree, dispenso::BiPropGraph); BENCHMARK(BM_build_bi_prop_dependency_chain); BENCHMARK(BM_build_bi_prop_dependency_group); BENCHMARK_TEMPLATE(BM_build_dependency_chain, dispenso::Graph); BENCHMARK_TEMPLATE(BM_build_dependency_chain, dispenso::BiPropGraph); BENCHMARK_TEMPLATE(BM_execute_dependency_chain, dispenso::Graph); BENCHMARK_TEMPLATE(BM_execute_dependency_chain, dispenso::BiPropGraph); BENCHMARK_TEMPLATE(BM_forward_propagator_node, dispenso::Graph); BENCHMARK_TEMPLATE(BM_forward_propagator_node, dispenso::BiPropGraph); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/graph_scene_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #if TF_VERSION > 300000 #include #endif // TF_VERSION #include #include #include // For this benchmarks we create set of tasks similar to the scene graph. // Scene consist of the hierarchy of the Transforms which have local space transformation matrix. // World matrix should be calculated as multiplication of the parent world matrix and local space // matrix of the transform. Some transforms have two geometry index. inGeoIdex - is index in source // geometry array (which generated by functions). This geometry should be transformed by worldMatrix // and stored into outGeo array. // generateGeo─┐ // │ // 4 │ // transform 2 ◀──────┐ transform 5 ┌─ ─┬──▼─────┬─ ─┐ // ┌─────────────┐ │ ┌───────────────┐ inGeo: │ ... │ points │ ... │ // │ parent │ └────parent 2│ └─ ─┴──┬─────┴─ ─┘ // │ matrix │ ┌──◀───matrix │ │ // │ worldMatrix─────*────▶ worldMatrix │───────▶────────────* transformGeo // │ inGeoIndex │ │ inGeoIndex 4│ 9 │ // │ outGeoIndex │ │ outGeoIndex 9│ ┌── ─┬──▼─────┬─ ─┐ // └─────────────┘ └───────────────┘ outGeo: │ ... │ points │ ... 
│ // calculateWorldMatrix └── ─┴────────┴─ ─┘ namespace { namespace params { // parameters of the scene constexpr size_t numTransforms = 1000; // number transforms in hierarchy constexpr size_t numInGeo = 100; // number unique geometries constexpr size_t numVertexMultiplier = 1024; constexpr size_t everyNthHasGeo = 2; // probability that transform has geo is 1/everyNthHasGeo } // namespace params using Vec3 = std::array; using Matrix4 = std::array; constexpr size_t kNoGeometry = std::numeric_limits::max(); constexpr float kPi = 3.14159265358979323846f; constexpr size_t kRoot = std::numeric_limits::max(); struct Transform { Matrix4 matrix; Matrix4 worldMatrix; size_t parent = kRoot; size_t inGeoIndex = kNoGeometry; size_t outGeoIndex = kNoGeometry; }; using Geometry = std::vector; struct Scene { std::vector transforms; std::vector inGeo; std::vector outGeo; }; //-------------------------------------------------------------------------------- void branchlessONB(const Vec3& n, Vec3& b1, Vec3& b2) { const float sign = std::copysign(1.f, n[2]); const float a = -1.0f / (sign + n[2]); const float b = n[0] * n[1] * a; b1 = Vec3{1.0f + sign * n[0] * n[0] * a, sign * b, -sign * n[0]}; b2 = Vec3{b, sign + n[1] * n[1] * a, -n[1]}; } Matrix4 getRandomTransformMatrix(std::mt19937& rng) { std::uniform_real_distribution thetaDistr(0.f, 2.f * kPi); std::uniform_real_distribution uDistr(-1.f, 1.f); const float theta = thetaDistr(rng); const float u = uDistr(rng); const float sq = std::sqrt(1 - u * u); const Vec3 dirY = {sq * std::cos(theta), sq * std::sin(theta), u}; Vec3 dirX, dirZ; branchlessONB(dirY, dirZ, dirX); const Vec3 pos{uDistr(rng), uDistr(rng), uDistr(rng)}; // clang-format off return {dirX[0], dirX[1], dirX[2], 0.f, dirY[0], dirY[1], dirY[2], 0.f, dirZ[0], dirZ[1], dirZ[2], 0.f, pos[0], pos[1], pos[2], 1.f }; // clang-format on } Scene generateTransformsHierarchy( size_t numTransforms, size_t numInGeo, size_t everyNthHasGeo, std::mt19937& rng) { std::uniform_int_distribution distGeom(0, numInGeo - 1); std::uniform_int_distribution distProb(0, everyNthHasGeo - 1); Scene scene; scene.inGeo.resize(numInGeo); std::vector& transforms = scene.transforms; transforms.resize(numTransforms); transforms[0].matrix = getRandomTransformMatrix(rng); transforms[0].worldMatrix = transforms[0].matrix; size_t outGeoCounter = 0; for (size_t i = 1; i < numTransforms; ++i) { Transform& transform = transforms[i]; std::uniform_int_distribution dist(0, i - 1); transform.parent = dist(rng); transform.matrix = getRandomTransformMatrix(rng); if (distProb(rng) == 0) { transform.inGeoIndex = distGeom(rng); transform.outGeoIndex = outGeoCounter++; } } scene.outGeo.resize(outGeoCounter); return scene; } //--------------------------------compute functions------------------------------- Vec3 multiply(const Vec3& v, const Matrix4& m) { const float invertW = 1.f / (m[11] * v[2] + m[7] * v[1] + m[3] * v[0] + m[15]); return { (m[8] * v[2] + m[4] * v[1] + m[0] * v[0] + m[12]) * invertW, (m[9] * v[2] + m[5] * v[1] + m[1] * v[0] + m[13]) * invertW, (m[10] * v[2] + m[6] * v[1] + m[2] * v[0] + m[14]) * invertW}; } Matrix4 multiply(const Matrix4& ma, const Matrix4& mb) { return { ma[2] * mb[8] + ma[1] * mb[4] + ma[3] * mb[12] + ma[0] * mb[0], ma[2] * mb[9] + ma[1] * mb[5] + ma[3] * mb[13] + ma[0] * mb[1], ma[1] * mb[6] + ma[0] * mb[2] + ma[3] * mb[14] + ma[2] * mb[10], ma[1] * mb[7] + ma[0] * mb[3] + ma[3] * mb[15] + ma[2] * mb[11], ma[6] * mb[8] + ma[5] * mb[4] + ma[7] * mb[12] + ma[4] * mb[0], ma[6] * mb[9] + ma[5] * mb[5] + ma[7] * 
mb[13] + ma[4] * mb[1], ma[5] * mb[6] + ma[4] * mb[2] + ma[7] * mb[14] + ma[6] * mb[10], ma[5] * mb[7] + ma[4] * mb[3] + ma[7] * mb[15] + ma[6] * mb[11], ma[10] * mb[8] + ma[9] * mb[4] + ma[11] * mb[12] + ma[8] * mb[0], ma[10] * mb[9] + ma[9] * mb[5] + ma[11] * mb[13] + ma[8] * mb[1], ma[9] * mb[6] + ma[8] * mb[2] + ma[11] * mb[14] + ma[10] * mb[10], ma[9] * mb[7] + ma[8] * mb[3] + ma[11] * mb[15] + ma[10] * mb[11], ma[14] * mb[8] + ma[13] * mb[4] + ma[15] * mb[12] + ma[12] * mb[0], ma[14] * mb[9] + ma[13] * mb[5] + ma[15] * mb[13] + ma[12] * mb[1], ma[13] * mb[6] + ma[12] * mb[2] + ma[15] * mb[14] + ma[14] * mb[10], ma[13] * mb[7] + ma[12] * mb[3] + ma[15] * mb[15] + ma[14] * mb[11]}; } void calculateWorldMatrix(std::vector& transforms, size_t index) { Transform& transform = transforms[index]; if (transform.parent == kRoot) { transform.worldMatrix = transform.matrix; } else { transform.worldMatrix = multiply(transform.matrix, transforms[transform.parent].worldMatrix); } } size_t numGeoPoints(size_t inGeoIndex) { return (inGeoIndex + 1) * params::numVertexMultiplier; } Vec3 calculateGeoPoint(size_t inGeoIndex, size_t pointIndex) { size_t numPoints = numGeoPoints(inGeoIndex); const float tStep = 2.f * 2.f * kPi / (static_cast(numPoints) - 1.f); const float t = tStep * static_cast(pointIndex); const float r = 0.01; return {r * t * std::cos(t), r * t * std::sin(t), r * (t + std::sin(16 * t))}; } //-------------------------------------------------------------------------------- // taskflow //-------------------------------------------------------------------------------- tf::Task generateGeoTF(tf::Taskflow& taskflow, std::vector& inGeo, size_t inGeoIndex) { size_t numPoints = numGeoPoints(inGeoIndex); Geometry& g = inGeo[inGeoIndex]; g.resize(numPoints); return taskflow.for_each_index( size_t(0), numPoints, size_t(1), [&](size_t i) { g[i] = calculateGeoPoint(inGeoIndex, i); }); } tf::Task transformGeoTF(tf::Taskflow& taskflow, Geometry& g, const Geometry& inG, const Matrix4& m) { const size_t numPoints = inG.size(); g.resize(numPoints); return taskflow.for_each_index( size_t(0), numPoints, size_t(1), [&](size_t i) { g[i] = multiply(inG[i], m); }); } void prepareGraphTF(tf::Taskflow& taskflow, Scene& scene) { std::mt19937 rng(12345); scene = generateTransformsHierarchy( params::numTransforms, params::numInGeo, params::everyNthHasGeo, rng); std::vector& inGeo = scene.inGeo; std::vector& transforms = scene.transforms; std::vector& outGeo = scene.outGeo; // calculate inGeo std::vector inGeoTasks(params::numInGeo); for (size_t i = 0; i < params::numInGeo; ++i) { inGeoTasks[i] = generateGeoTF(taskflow, inGeo, i); } // calculate transforms std::vector transformTasks(params::numTransforms); std::vector outGeoTasks(outGeo.size()); for (size_t i = 0; i < params::numTransforms; ++i) { transformTasks[i] = taskflow.emplace([&transforms, i]() { calculateWorldMatrix(transforms, i); }); const size_t parentIndex = transforms[i].parent; if (parentIndex != kRoot) { transformTasks[i].succeed(transformTasks[parentIndex]); } // calculate outGeo const size_t outGeoIndex = transforms[i].outGeoIndex; const size_t inGeoIndex = transforms[i].inGeoIndex; if (inGeoIndex != kNoGeometry) { outGeoTasks[outGeoIndex] = transformGeoTF( taskflow, outGeo[outGeoIndex], inGeo[inGeoIndex], transforms[i].worldMatrix); outGeoTasks[outGeoIndex].succeed(transformTasks[i], inGeoTasks[inGeoIndex]); } } } //-------------------------------------------------------------------------------- // dispenso 
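// The dispenso variant below mirrors the Taskflow graph above, except that each
// geometry node additionally nests a dispenso::parallel_for over its points. A minimal
// sketch of the node/dependency/execution pattern used here (names and sizes are
// illustrative assumptions, not taken from this file):
//   dispenso::Graph g;
//   dispenso::Node& produce = g.addNode([]() { /* produce data */ });
//   dispenso::Node& consume = g.addNode([]() { /* consume data */ });
//   consume.dependsOn(produce);
//   dispenso::ConcurrentTaskSet tasks(dispenso::globalThreadPool());
//   dispenso::ConcurrentTaskSetExecutor exec;
//   setAllNodesIncomplete(g);
//   exec(tasks, g);
//   tasks.wait();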
//-------------------------------------------------------------------------------- void generateGeo( dispenso::ThreadPool& threadPool, std::vector& inGeo, size_t inGeoIndex) { size_t numPoints = numGeoPoints(inGeoIndex); Geometry& g = inGeo[inGeoIndex]; g.resize(numPoints); dispenso::TaskSet taskSet(threadPool); dispenso::ParForOptions options; options.maxThreads = static_cast(numPoints); options.minItemsPerChunk = 256; dispenso::parallel_for( taskSet, 0, numPoints, [&](size_t i) { g[i] = calculateGeoPoint(inGeoIndex, i); }, options); } void transformGeo( dispenso::ThreadPool& threadPool, Geometry& g, const Geometry& inG, const Matrix4& m) { dispenso::TaskSet taskSet(threadPool); const size_t numPoints = inG.size(); dispenso::ParForOptions options; options.maxThreads = static_cast(numPoints); options.minItemsPerChunk = 256; g.resize(numPoints); dispenso::parallel_for( taskSet, 0, numPoints, [&](size_t i) { g[i] = multiply(inG[i], m); }, options); } struct Subgraphs { dispenso::Subgraph* inGeo; dispenso::Subgraph* transforms; dispenso::Subgraph* outGeo; }; Subgraphs prepareGraph(dispenso::ThreadPool& threadPool, Scene& scene, dispenso::Graph& g) { std::mt19937 rng(12345); scene = generateTransformsHierarchy( params::numTransforms, params::numInGeo, params::everyNthHasGeo, rng); std::vector& inGeo = scene.inGeo; std::vector& transforms = scene.transforms; std::vector& outGeo = scene.outGeo; // calculate inGeo dispenso::Subgraph& transformsSub = g.addSubgraph(); dispenso::Subgraph& inGeoSub = g.addSubgraph(); dispenso::Subgraph& outGeoSub = g.addSubgraph(); // Reserve capacity for known sizes transformsSub.reserve(params::numTransforms); inGeoSub.reserve(params::numInGeo); for (size_t i = 0; i < params::numInGeo; ++i) { inGeoSub.addNode([&threadPool, &inGeo, i]() { generateGeo(threadPool, inGeo, i); }); } // calculate transforms for (size_t i = 0; i < params::numTransforms; ++i) { transformsSub.addNode([&transforms, i]() { calculateWorldMatrix(transforms, i); }); const size_t parentIndex = transforms[i].parent; if (parentIndex != kRoot) { transformsSub.node(i).dependsOn(transformsSub.node(parentIndex)); } // calculate outGeo const size_t outGeoIndex = transforms[i].outGeoIndex; const size_t inGeoIndex = transforms[i].inGeoIndex; if (inGeoIndex != kNoGeometry) { outGeoSub.addNode([&, i, inGeoIndex, outGeoIndex]() { transformGeo(threadPool, outGeo[outGeoIndex], inGeo[inGeoIndex], transforms[i].worldMatrix); }); outGeoSub.node(outGeoIndex).dependsOn(transformsSub.node(i), inGeoSub.node(inGeoIndex)); } } return {&inGeoSub, &transformsSub, &outGeoSub}; } //----------------------------------test results---------------------------------- #ifndef NDEBUG template bool compare(const std::array& ma, const std::array& mb) { bool result = true; for (size_t i = 0; i < N; ++i) { result = result && std::abs(ma[i] - mb[i]) < std::numeric_limits::epsilon(); } return result; } bool testScene(const Scene& scene) { bool result = true; for (const Transform& transform : scene.transforms) { const Matrix4 worldMatrix = transform.parent == kRoot ? 
transform.matrix : multiply(transform.matrix, scene.transforms[transform.parent].worldMatrix); result = result && compare(transform.worldMatrix, worldMatrix); } for (const Transform& transform : scene.transforms) { const size_t outGeoIndex = transform.outGeoIndex; const size_t inGeoIndex = transform.inGeoIndex; if (inGeoIndex != kNoGeometry) { const size_t numVertices = scene.inGeo[inGeoIndex].size(); assert(numVertices == scene.outGeo[outGeoIndex].size()); for (size_t i = 0; i < numVertices; ++i) { const Vec3 worldPos = multiply(scene.inGeo[inGeoIndex][i], transform.worldMatrix); result = result && compare(scene.outGeo[outGeoIndex][i], worldPos); } } } return result; } void cleanScene(Scene& s) { for (Geometry& g : s.inGeo) { g.clear(); } for (Geometry& g : s.outGeo) { g.clear(); } for (Transform& t : s.transforms) { std::fill(t.worldMatrix.begin(), t.worldMatrix.end(), 0.f); } } #endif } // anonymous namespace static void BM_scene_graph_parallel_for(benchmark::State& state) { // transform inGeo dispenso::ThreadPool& threadPool = dispenso::globalThreadPool(); dispenso::Graph g; Scene scene; prepareGraph(threadPool, scene, g); dispenso::ParallelForExecutor parallelForExecutor; dispenso::TaskSet taskSet(threadPool); for (auto _ : state) { state.PauseTiming(); setAllNodesIncomplete(g); state.ResumeTiming(); parallelForExecutor(taskSet, g); #ifndef NDEBUG state.PauseTiming(); assert(testScene(scene)); cleanScene(scene); state.ResumeTiming(); #endif } } static void BM_scene_graph_concurrent_task_set(benchmark::State& state) { // transform inGeo dispenso::ThreadPool& threadPool = dispenso::globalThreadPool(); dispenso::Graph g; Scene scene; prepareGraph(threadPool, scene, g); dispenso::ConcurrentTaskSet concurrentTaskSet(threadPool); dispenso::ConcurrentTaskSetExecutor concurrentTaskSetExecutor; for (auto _ : state) { state.PauseTiming(); setAllNodesIncomplete(g); state.ResumeTiming(); concurrentTaskSetExecutor(concurrentTaskSet, g); concurrentTaskSet.wait(); #ifndef NDEBUG state.PauseTiming(); assert(testScene(scene)); cleanScene(scene); state.ResumeTiming(); #endif } } static void BM_scene_graph_partial_revaluation(benchmark::State& state) { dispenso::ThreadPool& threadPool = dispenso::globalThreadPool(); std::mt19937 rng(123456); dispenso::Graph g; Scene scene; Subgraphs subgraphs = prepareGraph(threadPool, scene, g); const size_t numTransforms = scene.transforms.size(); std::uniform_real_distribution transformIndexDistr(0, numTransforms - 1); dispenso::ConcurrentTaskSet concurrentTaskSet(threadPool); dispenso::ConcurrentTaskSetExecutor concurrentTaskSetExecutor; dispenso::ForwardPropagator forwardPropagator; setAllNodesIncomplete(g); concurrentTaskSetExecutor(concurrentTaskSet, g); concurrentTaskSet.wait(); assert(testScene(scene)); for (auto _ : state) { state.PauseTiming(); // change several transforms rng.seed(123456); for (size_t i = 0; i < 10; ++i) { const size_t index = transformIndexDistr(rng); Transform& t = scene.transforms[index]; t.matrix = getRandomTransformMatrix(rng); subgraphs.transforms->node(index).setIncomplete(); } state.ResumeTiming(); // The graph automatically recalculates all children of modified transforms. If these transforms // have geometry this geometry will be recomputed as well. 
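// Partial re-evaluation, step by step: the modified transform nodes were flipped with
// setIncomplete() above; ForwardPropagator then marks every transitive dependent
// incomplete as well, so the executor call below re-runs only that incomplete portion
// of the graph rather than the whole scene.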
forwardPropagator(g); concurrentTaskSetExecutor(concurrentTaskSet, g); concurrentTaskSet.wait(); #ifndef NDEBUG state.PauseTiming(); assert(testScene(scene)); state.ResumeTiming(); #endif } } // TODO(roman fedotov): Add partial evaluation variant for taskflow (possible implementation: // conditional tasks) static void BM_scene_graph_taskflow(benchmark::State& state) { tf::Taskflow taskflow; Scene scene; prepareGraphTF(taskflow, scene); tf::Executor executor; for (auto _ : state) { executor.run(taskflow).wait(); #ifndef NDEBUG state.PauseTiming(); assert(testScene(scene)); cleanScene(scene); state.ResumeTiming(); #endif } } BENCHMARK(BM_scene_graph_parallel_for)->UseRealTime(); BENCHMARK(BM_scene_graph_concurrent_task_set)->UseRealTime(); BENCHMARK(BM_scene_graph_taskflow)->UseRealTime(); #ifndef NDEBUG BENCHMARK(BM_scene_graph_partial_revaluation)->UseRealTime()->Iterations(50); #else BENCHMARK(BM_scene_graph_partial_revaluation)->UseRealTime(); #endif BENCHMARK_MAIN(); ================================================ FILE: benchmarks/idle_pool_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * Benchmarks for thread pool behavior under low/idle workloads. * Tests CPU usage and responsiveness when threads are mostly waiting. */ #include #include #if !defined(BENCHMARK_WITHOUT_TBB) #include #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" using namespace std::chrono_literals; static constexpr int kSmallSize = 1000; static constexpr int kMediumSize = 10000; static constexpr int kLargeSize = 1000000; struct alignas(64) Work { size_t count = 0; void operator+=(size_t o) { count += o; } }; Work g_work[1025]; std::atomic g_tCounter{0}; inline int testTid() { static DISPENSO_THREAD_LOCAL int t = -1; if (t < 0) { t = g_tCounter.fetch_add(1, std::memory_order_acq_rel); } return t; } inline Work& work() { static DISPENSO_THREAD_LOCAL Work* w = nullptr; if (!w) { if (testTid() == 0) { w = g_work + 1024; } else { w = g_work + (testTid() & 1023); } } return *w; } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb_mostly_idle(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); struct Recurse { void operator()() const { work() += i; if (i < num_elements) { ++i; g->run(*this); } } mutable int i; mutable tbb::task_group* g; int num_elements; }; tbb_compat::task_scheduler_init initsched(num_threads); startRusage(); for (auto UNUSED_VAR : state) { tbb::task_group g; Recurse rec; rec.i = 0; rec.g = &g; rec.num_elements = num_elements; rec(); g.wait(); } endRusage(state); } void BM_tbb_very_idle(benchmark::State& state) { const int num_threads = state.range(0); tbb_compat::task_scheduler_init initsched(num_threads); startRusage(); for (auto UNUSED_VAR : state) { tbb::task_group g; g.run([]() {}); std::this_thread::sleep_for(100ms); g.run([]() {}); g.wait(); } endRusage(state); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_mostly_idle(benchmark::State& state) { const int num_threads = std::max(1, state.range(0) - 1); const int num_elements = state.range(1); struct Recurse { void operator()() { work() += i; if (i < num_elements) { ++i; tasks->schedule(*this); } } int i; dispenso::ConcurrentTaskSet* tasks; int num_elements; }; dispenso::ThreadPool pool(num_threads); startRusage(); for (auto UNUSED_VAR : state) { 
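// Each iteration forms a serial chain: rec() schedules one copy of itself on the
// ConcurrentTaskSet until num_elements tasks have run, so at most one task is runnable
// at a time and the remaining pool threads sit idle; endRusage() after the loop reports
// the CPU those idle workers consumed while waiting.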
dispenso::ConcurrentTaskSet tasks(pool); Recurse rec; rec.i = 0; rec.tasks = &tasks; rec.num_elements = num_elements; rec(); tasks.wait(); } endRusage(state); } void BM_dispenso_very_idle(benchmark::State& state) { const int num_threads = state.range(0) - 1; dispenso::ThreadPool pool(num_threads); startRusage(); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); tasks.schedule([]() {}); std::this_thread::sleep_for(100ms); tasks.schedule([]() {}); // TaskSet destructor waits for both tasks, mirroring TBB's g.wait() } endRusage(state); } static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int s : pow2HalfStepThreads()) { b->Args({s, j}); } } } static void CustomArgumentsVeryIdle(benchmark::internal::Benchmark* b) { for (int s : pow2HalfStepThreads()) { b->Args({s}); } } #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_mostly_idle)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); BENCHMARK(BM_tbb_very_idle) ->Apply(CustomArgumentsVeryIdle) ->Unit(benchmark::kMicrosecond) ->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_mostly_idle) ->Apply(CustomArguments) ->Unit(benchmark::kMicrosecond) ->UseRealTime(); BENCHMARK(BM_dispenso_very_idle) ->Apply(CustomArgumentsVeryIdle) ->Unit(benchmark::kMicrosecond) ->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/locality_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * Benchmark for cache/NUMA-sensitive workloads. * * Tests a repeated stencil pattern over large arrays (exceeding L3 cache) * where deterministic thread-to-memory mapping (kStatic) may outperform * dynamic chunking (kAuto) due to cache locality and NUMA effects. * * The work per element is intentionally trivial (memory-bound) so that * memory access patterns dominate performance, not compute. */ // MSVC uses __restrict, GCC/Clang use __restrict__ #ifdef _MSC_VER #define RESTRICT __restrict #else #define RESTRICT __restrict__ #endif #include #include #include #include #if defined(_OPENMP) #include #endif #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_for.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" // ~32MB per array (4M doubles). Two arrays = 64MB, exceeds typical per-core L2 // but may fit in shared L3 on some machines. Scale up if needed. static constexpr size_t kSmallSize = 100000; // ~800KB per array static constexpr size_t kMediumSize = 4000000; // ~32MB per array static constexpr size_t kLargeSize = 32000000; // ~256MB per array static constexpr int kPasses = 10; // Initialize arrays with deterministic data. static void initArrays(std::vector& input, std::vector& output, size_t n) { input.resize(n); output.resize(n); for (size_t i = 0; i < n; ++i) { input[i] = static_cast(i % 1000) * 0.001; } std::memset(output.data(), 0, n * sizeof(double)); } // Simple 3-point stencil: output[i] = 0.25 * (input[i-1] + 2*input[i] + input[i+1]) // Cheap compute, memory-bound. Multiple passes amplify locality effects. inline void stencilPass(const double* RESTRICT src, double* RESTRICT dst, size_t begin, size_t end, size_t n) { for (size_t i = begin; i < end; ++i) { size_t im = (i > 0) ? i - 1 : 0; size_t ip = (i < n - 1) ? 
i + 1 : n - 1; dst[i] = 0.25 * (src[im] + 2.0 * src[i] + src[ip]); } } // Verify output is non-garbage (spot-check a few values). static void checkOutput(const double* data, size_t n) { if (n == 0) return; // After stencil passes, values should be finite and in a reasonable range. for (size_t i = 0; i < n; i += n / 10 + 1) { if (!std::isfinite(data[i])) { std::cerr << "FAIL: non-finite value at index " << i << std::endl; abort(); } } } template void BM_serial(benchmark::State& state) { std::vector input, output; initArrays(input, output, num_elements); for (auto UNUSED_VAR : state) { for (int pass = 0; pass < kPasses; ++pass) { stencilPass(input.data(), output.data(), 0, num_elements, num_elements); std::swap(input, output); } } checkOutput(input.data(), num_elements); } void BM_dispenso_static(benchmark::State& state) { const int num_threads = state.range(0) - 1; const size_t num_elements = state.range(1); std::vector input, output; initArrays(input, output, num_elements); dispenso::ThreadPool pool(num_threads); for (auto UNUSED_VAR : state) { for (int pass = 0; pass < kPasses; ++pass) { dispenso::TaskSet tasks(pool); const double* src = input.data(); double* dst = output.data(); auto range = dispenso::makeChunkedRange(size_t{0}, num_elements, dispenso::ParForChunking::kStatic); dispenso::parallel_for(tasks, range, [src, dst, num_elements](size_t begin, size_t end) { stencilPass(src, dst, begin, end, num_elements); }); std::swap(input, output); } } checkOutput(input.data(), num_elements); } void BM_dispenso_auto(benchmark::State& state) { const int num_threads = state.range(0) - 1; const size_t num_elements = state.range(1); std::vector input, output; initArrays(input, output, num_elements); dispenso::ThreadPool pool(num_threads); for (auto UNUSED_VAR : state) { for (int pass = 0; pass < kPasses; ++pass) { dispenso::TaskSet tasks(pool); const double* src = input.data(); double* dst = output.data(); auto range = dispenso::makeChunkedRange(size_t{0}, num_elements, dispenso::ParForChunking::kAuto); dispenso::parallel_for(tasks, range, [src, dst, num_elements](size_t begin, size_t end) { stencilPass(src, dst, begin, end, num_elements); }); std::swap(input, output); } } checkOutput(input.data(), num_elements); } #if defined(_OPENMP) void BM_omp(benchmark::State& state) { const int num_threads = state.range(0); const size_t num_elements = state.range(1); std::vector input, output; initArrays(input, output, num_elements); omp_set_num_threads(num_threads); for (auto UNUSED_VAR : state) { for (int pass = 0; pass < kPasses; ++pass) { const double* src = input.data(); double* dst = output.data(); #pragma omp parallel for schedule(static) for (int64_t i = 0; i < static_cast(num_elements); ++i) { size_t im = (i > 0) ? i - 1 : 0; size_t ip = (i < num_elements - 1) ? 
i + 1 : num_elements - 1; dst[i] = 0.25 * (src[im] + 2.0 * src[i] + src[ip]); } std::swap(input, output); } } checkOutput(input.data(), num_elements); } #endif /* defined(_OPENMP) */ #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int num_threads = state.range(0); const size_t num_elements = state.range(1); std::vector input, output; initArrays(input, output, num_elements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(num_threads); for (int pass = 0; pass < kPasses; ++pass) { const double* src = input.data(); double* dst = output.data(); tbb::parallel_for( tbb::blocked_range(0, num_elements), [src, dst, num_elements](const tbb::blocked_range& r) { stencilPass(src, dst, r.begin(), r.end(), num_elements); }); std::swap(input, output); } } checkOutput(input.data(), num_elements); } #endif // !BENCHMARK_WITHOUT_TBB static void CustomArguments(benchmark::internal::Benchmark* b) { for (size_t j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : {1, 4, 16, 64, 128}) { b->Args({i, static_cast(j)}); } } } BENCHMARK_TEMPLATE(BM_serial, kSmallSize); BENCHMARK_TEMPLATE(BM_serial, kMediumSize); BENCHMARK_TEMPLATE(BM_serial, kLargeSize); #if defined(_OPENMP) BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso_static)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_auto)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/nested_for_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #if defined(_OPENMP) #include #endif #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_reduce.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" namespace { constexpr int kWorkMultiplier = 4; constexpr int kSmallSize = 10; constexpr int kMediumSize = 500; constexpr int kLargeSize = 2500; int g_numThreads = 0; } // namespace uint32_t getInputs(int numElements) { srand(numElements); return rand() & 127; } inline uint64_t calculate(uint64_t input, uint64_t index, size_t foo) { return std::cos( std::log( std::sin(std::exp(std::sqrt(static_cast((input ^ index) - 3 * foo * input)))))); } uint64_t calculateInnerSerial(uint64_t input, size_t foo, int numElements) { uint64_t sum = 0; for (size_t i = 0; i < kWorkMultiplier * numElements; ++i) { sum += calculate(input, i, foo); } return sum; } void checkResults(uint32_t input, uint64_t actual, int foo, size_t numElements) { if (!foo) return; if (input != getInputs(numElements)) { std::cerr << "Failed to recover input!" << std::endl; abort(); } uint64_t expected = 0; for (size_t i = 0; i < numElements; ++i) { expected += calculateInnerSerial(input, foo, numElements); } if (expected != actual) { std::cerr << "FAIL! 
" << expected << " vs " << actual << std::endl; abort(); } } template void BM_serial(benchmark::State& state) { auto input = getInputs(numElements); uint64_t sum = 0; int foo = 0; for (auto UNUSED_VAR : state) { sum = 0; ++foo; for (size_t j = 0; j < numElements; ++j) { sum += calculateInnerSerial(input, foo, numElements); } } checkResults(input, sum, foo, numElements); } uint64_t calculateInnerDispenso(uint64_t input, size_t foo, int numElements) { std::vector sums; sums.reserve(g_numThreads + 1); dispenso::parallel_for( sums, []() { return uint64_t{0}; }, 0, kWorkMultiplier * numElements, [input, foo](uint64_t& lsumStore, size_t i, size_t end) { uint64_t lsum = 0; for (; i != end; ++i) { lsum += calculate(input, i, foo); } lsumStore += lsum; }); uint64_t sum = 0; for (auto s : sums) { sum += s; } return sum; } void BM_dispenso(benchmark::State& state) { g_numThreads = state.range(0) - 1; const int numElements = state.range(1); dispenso::resizeGlobalThreadPool(g_numThreads); uint64_t sum = 0; int foo = 0; auto input = getInputs(numElements); for (auto UNUSED_VAR : state) { std::vector sums; sums.reserve(g_numThreads + 1); ++foo; dispenso::parallel_for( sums, []() { return uint64_t{0}; }, 0, numElements, [numElements, input, foo](uint64_t& lsumStore, size_t j, size_t end) { uint64_t lsum = 0; for (; j != end; ++j) { lsum += calculateInnerDispenso(input, foo, numElements); } lsumStore += lsum; }); sum = 0; for (auto s : sums) { sum += s; } } checkResults(input, sum, foo, numElements); } uint64_t calculateInnerDispensoAuto(uint64_t input, size_t foo, int numElements) { std::vector sums; sums.reserve(g_numThreads + 1); dispenso::ParForOptions options; options.defaultChunking = dispenso::ParForChunking::kAuto; dispenso::parallel_for( sums, []() { return uint64_t{0}; }, 0, kWorkMultiplier * numElements, [input, foo](uint64_t& lsumStore, size_t i, size_t end) { uint64_t lsum = 0; for (; i != end; ++i) { lsum += calculate(input, i, foo); } lsumStore += lsum; }, options); uint64_t sum = 0; for (auto s : sums) { sum += s; } return sum; } void BM_dispenso_auto(benchmark::State& state) { g_numThreads = state.range(0) - 1; const int numElements = state.range(1); dispenso::resizeGlobalThreadPool(g_numThreads); uint64_t sum = 0; int foo = 0; dispenso::ParForOptions options; options.defaultChunking = dispenso::ParForChunking::kAuto; auto input = getInputs(numElements); for (auto UNUSED_VAR : state) { std::vector sums; sums.reserve(g_numThreads + 1); ++foo; dispenso::parallel_for( sums, []() { return uint64_t{0}; }, 0, numElements, [numElements, input, foo](uint64_t& lsumStore, size_t j, size_t end) { uint64_t lsum = 0; for (; j != end; ++j) { lsum += calculateInnerDispensoAuto(input, foo, numElements); } lsumStore += lsum; }, options); sum = 0; for (auto s : sums) { sum += s; } } checkResults(input, sum, foo, numElements); } #if defined(_OPENMP) uint64_t calculateInnerOmp(uint64_t input, size_t foo, int numElements) { uint64_t sum = 0; #pragma omp parallel for reduction(+ : sum) for (int i = 0; i < kWorkMultiplier * numElements; ++i) { sum += calculate(input, i, foo); } return sum; } void BM_omp(benchmark::State& state) { g_numThreads = state.range(0); const int numElements = state.range(1); omp_set_num_threads(g_numThreads); uint64_t sum = 0; int foo = 0; auto input = getInputs(numElements); for (auto UNUSED_VAR : state) { sum = 0; ++foo; #pragma omp parallel for reduction(+ : sum) for (int i = 0; i < numElements; ++i) { sum += calculateInnerOmp(input, foo, numElements); } } checkResults(input, sum, 
foo, numElements); } #endif /*defined(_OPENMP)*/ #if !defined(BENCHMARK_WITHOUT_TBB) uint64_t calculateInnerTbb(uint64_t input, size_t foo, int numElements) { return tbb::parallel_reduce( tbb::blocked_range(0, kWorkMultiplier * numElements), uint64_t{0}, [input, foo](const tbb::blocked_range& r, uint64_t init) -> uint64_t { for (size_t a = r.begin(); a != r.end(); ++a) init += calculate(input, a, foo); return init; }, [](uint64_t x, uint64_t y) -> uint64_t { return x + y; }); } void BM_tbb(benchmark::State& state) { g_numThreads = state.range(0); const int numElements = state.range(1); uint64_t sum = 0; int foo = 0; auto input = getInputs(numElements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(g_numThreads); ++foo; sum = tbb::parallel_reduce( tbb::blocked_range(0, numElements), uint64_t{0}, [numElements, input, foo](const tbb::blocked_range& r, uint64_t init) -> uint64_t { for (size_t a = r.begin(); a != r.end(); ++a) init += calculateInnerTbb(input, foo, numElements); return init; }, [](uint64_t x, uint64_t y) -> uint64_t { return x + y; }); } checkResults(input, sum, foo, numElements); } #endif // !BENCHMARK_WITHOUT_TBB static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } BENCHMARK_TEMPLATE(BM_serial, kSmallSize); BENCHMARK_TEMPLATE(BM_serial, kMediumSize); BENCHMARK_TEMPLATE(BM_serial, kLargeSize); #if defined(_OPENMP) BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_auto)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/nested_pool_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * Benchmarks for nested/hierarchical task scheduling patterns. * Tests scenarios where tasks spawn additional child tasks. 
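 *
 * A minimal sketch of the nested pattern being measured, assuming the
 * dispenso::TaskSet API used by the benchmarks below (illustrative only):
 *
 *   dispenso::ThreadPool pool(8);
 *   dispenso::TaskSet outer(pool);
 *   outer.schedule([&pool]() {
 *     dispenso::TaskSet inner(pool);           // child tasks spawned from within a task
 *     inner.schedule([]() { doLeafWork(); });  // doLeafWork() is a placeholder
 *   });
 *   outer.wait();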
*/ #include #include #if !defined(BENCHMARK_WITHOUT_TBB) #include #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #if !defined(BENCHMARK_WITHOUT_FOLLY) #include #include #include #include #include #include #endif // !BENCHMARK_WITHOUT_FOLLY #include "thread_benchmark_common.h" static constexpr int kSmallSize = 1000; static constexpr int kMediumSize = 10000; static constexpr int kLargeSize = 300000; struct alignas(64) Work { size_t count = 0; void operator+=(size_t o) { count += o; } }; Work g_work[1025]; std::atomic g_tCounter{0}; inline int testTid() { static DISPENSO_THREAD_LOCAL int t = -1; if (t < 0) { t = g_tCounter.fetch_add(1, std::memory_order_acq_rel); } return t; } inline Work& work() { static DISPENSO_THREAD_LOCAL Work* w = nullptr; if (!w) { if (testTid() == 0) { w = g_work + 1024; } else { w = g_work + (testTid() & 1023); } } return *w; } void BM_dispenso(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); for (auto UNUSED_VAR : state) { dispenso::TaskSet outerTasks(pool); for (int i = 0; i < num_elements; ++i) { outerTasks.schedule([&pool, num_elements]() { int num = std::sqrt(num_elements); dispenso::TaskSet tasks(pool); for (int j = 0; j < num; ++j) { tasks.schedule([j]() { work() += j; }); } }); } } } void BM_dispenso_bulk(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); for (auto UNUSED_VAR : state) { dispenso::TaskSet outerTasks(pool); outerTasks.scheduleBulk(static_cast(num_elements), [&pool, num_elements](size_t) { return [&pool, num_elements]() { int num = std::sqrt(num_elements); dispenso::TaskSet tasks(pool); tasks.scheduleBulk( static_cast(num), [](size_t j) { return [j]() { work() += j; }; }); }; }); } } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); tbb_compat::task_scheduler_init initsched(num_threads); for (auto UNUSED_VAR : state) { tbb::task_group g; for (int i = 0; i < num_elements; ++i) { g.run([num_elements]() { int num = std::sqrt(num_elements); tbb::task_group g2; for (int j = 0; j < num; ++j) { g2.run([j]() { work() += j; }); } g2.wait(); }); } g.wait(); } } #endif // !BENCHMARK_WITHOUT_TBB #if !defined(BENCHMARK_WITHOUT_FOLLY) void BM_folly(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); if (num_elements > 10000) { state.SkipWithError("We run out of memory here with too many elements"); } folly::CPUThreadPoolExecutor follyExec(num_threads); for (auto UNUSED_VAR : state) { folly::coro::blockingWait([&]() -> folly::coro::Task { std::vector> tasks; for (int i = 0; i < num_elements; ++i) { tasks.push_back(folly::coro::co_invoke([num_elements]() -> folly::coro::Task { co_await folly::coro::co_reschedule_on_current_executor; std::vector> tasks2; int num = std::sqrt(num_elements); for (int j = 0; j < num; ++j) { tasks2.push_back(folly::coro::co_invoke([j]() -> folly::coro::Task { co_await folly::coro::co_reschedule_on_current_executor; work() += j; })); } co_await folly::coro::collectAllRange(std::move(tasks2)); })); } co_await folly::coro::collectAllRange(std::move(tasks)).scheduleOn(&follyExec); }()); } } #endif // !BENCHMARK_WITHOUT_FOLLY static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int s : 
pow2HalfStepThreads()) { b->Args({s, j}); } } } #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB #if !defined(BENCHMARK_WITHOUT_FOLLY) BENCHMARK(BM_folly)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_FOLLY BENCHMARK(BM_dispenso)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); BENCHMARK(BM_dispenso_bulk)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/once_function_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include "benchmark_common.h" constexpr size_t kSmallSize = 24; constexpr size_t kMediumSize = 120; constexpr size_t kLargeSize = 248; // 1000 is larger than our largest optimized chunk size, so we may expect to see performance falloff // here. constexpr size_t kExtraLargeSize = 1000; template void runMoveLoop(benchmark::State& state, Func f) { for (auto UNUSED_VAR : state) { ExeType t(f); ExeType o; for (int i = 0; i < 10; ++i) { o = std::move(t); t = std::move(o); } t(); } } template class FuncConsumer { public: void add(Func&& f) { funcs_.emplace_back(std::move(f)); } void consumeAll() { while (!funcs_.empty()) { Func f = std::move(funcs_.front()); funcs_.pop_front(); f(); } } private: std::deque funcs_; }; template struct Foo { Foo() { buf[0] = 0; benchmark::ClobberMemory(); } Foo(Foo&& f) { std::memcpy(buf, f.buf, kSize); } Foo(const Foo& f) { std::memcpy(buf, f.buf, kSize); } void operator()() { benchmark::DoNotOptimize(++buf[0]); } uint32_t buf[kSize / 4]; }; template void onceCall(F&& f) { F lf = std::move(f); lf(); } template void BM_move_std_function(benchmark::State& state) { runMoveLoop>(state, Foo()); } template void BM_move_once_function(benchmark::State& state) { runMoveLoop(state, Foo()); } constexpr int kMediumLoopLen = 200; template void BM_queue_inline_function(benchmark::State& state) { FuncConsumer> consumer; for (auto UNUSED_VAR : state) { for (int i = 0; i < kMediumLoopLen; ++i) { consumer.add(Foo()); } consumer.consumeAll(); } } template void BM_queue_std_function(benchmark::State& state) { FuncConsumer> consumer; for (auto UNUSED_VAR : state) { for (int i = 0; i < kMediumLoopLen; ++i) { consumer.add(Foo()); } consumer.consumeAll(); } } template void BM_queue_once_function(benchmark::State& state) { FuncConsumer consumer; for (auto UNUSED_VAR : state) { for (int i = 0; i < kMediumLoopLen; ++i) { consumer.add(Foo()); } consumer.consumeAll(); } } BENCHMARK_TEMPLATE(BM_move_std_function, kSmallSize); BENCHMARK_TEMPLATE(BM_move_once_function, kSmallSize); BENCHMARK_TEMPLATE(BM_move_std_function, kMediumSize); BENCHMARK_TEMPLATE(BM_move_once_function, kMediumSize); BENCHMARK_TEMPLATE(BM_move_std_function, kLargeSize); BENCHMARK_TEMPLATE(BM_move_once_function, kLargeSize); BENCHMARK_TEMPLATE(BM_move_std_function, kExtraLargeSize); BENCHMARK_TEMPLATE(BM_move_once_function, kExtraLargeSize); BENCHMARK_TEMPLATE(BM_queue_inline_function, kSmallSize); BENCHMARK_TEMPLATE(BM_queue_std_function, kSmallSize); BENCHMARK_TEMPLATE(BM_queue_once_function, kSmallSize); BENCHMARK_TEMPLATE(BM_queue_inline_function, kMediumSize); 
BENCHMARK_TEMPLATE(BM_queue_std_function, kMediumSize); BENCHMARK_TEMPLATE(BM_queue_once_function, kMediumSize); BENCHMARK_TEMPLATE(BM_queue_inline_function, kLargeSize); BENCHMARK_TEMPLATE(BM_queue_std_function, kLargeSize); BENCHMARK_TEMPLATE(BM_queue_once_function, kLargeSize); BENCHMARK_TEMPLATE(BM_queue_inline_function, kExtraLargeSize); BENCHMARK_TEMPLATE(BM_queue_std_function, kExtraLargeSize); BENCHMARK_TEMPLATE(BM_queue_once_function, kExtraLargeSize); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/pipeline_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include "benchmark_common.h" #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include #if TF_VERSION > 300000 #include #endif // TF_VERSION // (1) Generate images // (2) Calculate geometric mean // (3) Tonemap based on geometric mean constexpr size_t kWidth = 256; constexpr size_t kHeight = 256; constexpr size_t kNumImages = 500; constexpr size_t kSeed = 55; struct Work { Work(size_t idx) : index(idx) {} Work(Work&& w) : index(w.index), geometricMean(w.geometricMean), inputImage(std::move(w.inputImage)) {} Work& operator=(Work&& w) { index = w.index; geometricMean = w.geometricMean; inputImage = std::move(w.inputImage); return *this; } size_t index; double geometricMean = 0; std::unique_ptr inputImage; }; std::vector> g_serialResults; Work fillImage(Work work) { std::mt19937 rng(kSeed + work.index); work.inputImage = std::make_unique(kWidth * kHeight); std::uniform_int_distribution dist; for (size_t i = 0; i < kWidth * kHeight; ++i) { work.inputImage[i] = dist(rng); } return work; } Work computeGeometricMean(Work work) { double sum = 0; for (size_t i = 0; i < kWidth * kHeight; ++i) { sum += std::log(1.0e-10 + work.inputImage[i]); } work.geometricMean = std::exp(sum / (kWidth * kHeight)); return work; } std::unique_ptr tonemap(Work work) { auto out = std::make_unique(kWidth * kHeight); for (size_t i = 0; i < kWidth * kHeight; ++i) { double adjLum = work.inputImage[i] / work.geometricMean; out[i] = 255 * adjLum / (1.0 + adjLum); } return out; } void runSerial() { g_serialResults.resize(kNumImages); size_t index = 0; for (auto& out : g_serialResults) { out = tonemap(computeGeometricMean(fillImage(Work(index++)))); } } void checkResults(const std::vector>& results) { if (g_serialResults.empty()) { runSerial(); } if (g_serialResults.size() != results.size()) { std::cerr << "Number of results don't match" << std::endl; std::abort(); } for (size_t i = 0; i < results.size(); ++i) { if (std::memcmp( g_serialResults[i].get(), results[i].get(), kWidth * kHeight * sizeof(uint8_t))) { std::cerr << "Mismatch in results" << std::endl; for (size_t j = 0; j < 10; ++j) { std::cerr << (int)g_serialResults[i][j] << " vs " << (int)results[i][j] << std::endl; } std::abort(); } } } void BM_serial(benchmark::State& state) { for (auto UNUSED_VAR : state) { runSerial(); } } void runDispenso(std::vector>& results) { results.resize(kNumImages); size_t counter = 0; dispenso::pipeline( [&counter]() -> dispenso::OpResult { if (counter < kNumImages) { return fillImage(Work(counter++)); } return {}; }, computeGeometricMean, [&results](Work work) { size_t index = work.index; results[index] = tonemap(std::move(work)); }); } void 
BM_dispenso(benchmark::State& state) { std::vector> results; (void)dispenso::globalThreadPool(); for (auto UNUSED_VAR : state) { runDispenso(results); } checkResults(results); } void runDispensoPar(std::vector>& results) { results.resize(kNumImages); std::atomic counter(0); dispenso::pipeline( dispenso::stage( [&counter]() -> dispenso::OpResult { size_t curIndex = counter.fetch_add(1, std::memory_order_acquire); if (curIndex < kNumImages) { return fillImage(Work(curIndex)); } return {}; }, dispenso::kStageNoLimit), dispenso::stage(computeGeometricMean, dispenso::kStageNoLimit), dispenso::stage( [&results](Work work) { size_t index = work.index; results[index] = tonemap(std::move(work)); }, dispenso::kStageNoLimit)); } void BM_dispenso_par(benchmark::State& state) { std::vector> results; (void)dispenso::globalThreadPool(); for (auto UNUSED_VAR : state) { runDispensoPar(results); } checkResults(results); } #if !defined(BENCHMARK_WITHOUT_TBB) void runTBB(std::vector>& results) { results.resize(kNumImages); size_t counter = 0; tbb::parallel_pipeline( /*max_number_of_live_token=*/std::thread::hardware_concurrency(), tbb::make_filter( tbb_compat::filter::serial, [&counter](tbb::flow_control& fc) -> Work* { if (counter < kNumImages) { return new Work(fillImage(Work(counter++))); } fc.stop(); return nullptr; }) & tbb::make_filter( tbb_compat::filter::serial, [](Work* workIn) { Work& work = *workIn; work = computeGeometricMean(std::move(work)); return workIn; }) & tbb::make_filter(tbb_compat::filter::serial, [&results](Work* workIn) { size_t index = workIn->index; results[index] = tonemap(std::move(*workIn)); delete workIn; })); } void BM_tbb(benchmark::State& state) { std::vector> results; for (auto UNUSED_VAR : state) { runTBB(results); } checkResults(results); } void runTBBPar(std::vector>& results) { results.resize(kNumImages); std::atomic counter(0); tbb::parallel_pipeline( /*max_number_of_live_token=*/std::thread::hardware_concurrency(), tbb::make_filter( tbb_compat::filter::parallel, [&counter](tbb::flow_control& fc) -> Work* { size_t curIndex = counter.fetch_add(1, std::memory_order_acquire); if (curIndex < kNumImages) { return new Work(fillImage(Work(curIndex))); } fc.stop(); return nullptr; }) & tbb::make_filter( tbb_compat::filter::parallel, [](Work* work) { *work = computeGeometricMean(std::move(*work)); return work; }) & tbb::make_filter(tbb_compat::filter::parallel, [&results](Work* work) { size_t index = work->index; results[index] = tonemap(std::move(*work)); delete work; })); } void BM_tbb_par(benchmark::State& state) { std::vector> results; for (auto UNUSED_VAR : state) { runTBBPar(results); } checkResults(results); } #endif // !BENCHMARK_WITHOUT_TBB #if TF_VERSION > 300000 void runTaskflow(std::vector>& results, tf::Executor& exec) { results.resize(kNumImages); std::vector> work; // Ensure we don't resize underlying buffer causing data races work.reserve(kNumImages); tf::Taskflow taskflow; size_t counter2 = 0; size_t counter3 = 0; tf::Pipeline pl( std::thread::hardware_concurrency(), tf::Pipe{ tf::PipeType::SERIAL, [&work](auto& pf) mutable { if (work.size() < kNumImages) { work.push_back(std::make_unique(fillImage(Work(work.size())))); } else { pf.stop(); } }}, tf::Pipe{ tf::PipeType::SERIAL, [&counter2, &work](auto& pf) mutable { Work& w = *work[counter2++]; w = computeGeometricMean(std::move(w)); }}, tf::Pipe{tf::PipeType::SERIAL, [&counter3, &work, &results](auto& pf) mutable { Work& w = *work[counter3]; results[counter3++] = tonemap(std::move(w)); }}); 
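  // Compose the pipeline module into the taskflow, then run it to completion on the executor.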
taskflow.composed_of(pl); exec.run(taskflow).wait(); } void BM_taskflow(benchmark::State& state) { std::vector> results; tf::Executor executor(std::thread::hardware_concurrency()); for (auto UNUSED_VAR : state) { runTaskflow(results, executor); } checkResults(results); } void runTaskflowPar(std::vector>& results, tf::Executor& exec) { results.resize(kNumImages); std::vector> work(kNumImages); tf::Taskflow taskflow; tf::Pipeline pl( std::thread::hardware_concurrency(), tf::Pipe{ tf::PipeType::SERIAL, // First pipe must be serial in taskflow [&work](tf::Pipeflow& pf) { if (pf.token() >= kNumImages) { pf.stop(); return; } work[pf.token()] = std::make_unique(fillImage(Work(pf.token()))); }}, tf::Pipe{ tf::PipeType::PARALLEL, [&work](tf::Pipeflow& pf) { Work& w = *work[pf.token()]; w = computeGeometricMean(std::move(w)); }}, tf::Pipe{tf::PipeType::PARALLEL, [&work, &results](tf::Pipeflow& pf) { size_t token = pf.token(); results[token] = tonemap(std::move(*work[token])); }}); taskflow.composed_of(pl); exec.run(taskflow).wait(); } void BM_taskflow_par(benchmark::State& state) { std::vector> results; tf::Executor executor(std::thread::hardware_concurrency()); for (auto UNUSED_VAR : state) { runTaskflowPar(results, executor); } checkResults(results); } #endif // TF_VERSION > 300000 BENCHMARK(BM_serial)->UseRealTime(); BENCHMARK(BM_dispenso)->UseRealTime(); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB #if TF_VERSION > 300000 BENCHMARK(BM_taskflow)->UseRealTime(); #endif // TF_VERSION > 300000 BENCHMARK(BM_dispenso_par)->UseRealTime(); #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb_par)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB #if TF_VERSION > 300000 BENCHMARK(BM_taskflow_par)->UseRealTime(); #endif // TF_VERSION > 300000 BENCHMARK_MAIN(); ================================================ FILE: benchmarks/pool_allocator_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include "benchmark_common.h" constexpr size_t kSmallSize = 1024; constexpr size_t kMediumSize = 8192; constexpr size_t kLargeSize = 65536; template void run(benchmark::State& state, Alloc alloc, Free dealloc) { std::vector ptrs(state.range(0)); for (auto UNUSED_VAR : state) { for (char*& p : ptrs) { p = alloc(); } for (char* p : ptrs) { dealloc(p); } } } template void runArena(benchmark::State& state, PoolAlloc& allocator) { std::vector ptrs(state.range(0)); for (auto UNUSED_VAR : state) { for (char*& p : ptrs) { p = allocator.alloc(); } allocator.clear(); } } template void BM_mallocfree(benchmark::State& state) { run( state, []() { return reinterpret_cast(::malloc(kSize)); }, [](char* buf) { ::free(buf); }); } template void BM_pool_allocator(benchmark::State& state) { dispenso::PoolAllocator allocator(kSize, kSize * 32, ::malloc, ::free); run( state, [&allocator]() { return allocator.alloc(); }, [&allocator](char* buf) { allocator.dealloc(buf); }); } template void BM_nl_pool_allocator(benchmark::State& state) { dispenso::NoLockPoolAllocator allocator(kSize, kSize * 32, ::malloc, ::free); run( state, [&allocator]() { return allocator.alloc(); }, [&allocator](char* buf) { allocator.dealloc(buf); }); } template void BM_pool_allocator_arena(benchmark::State& state) { dispenso::PoolAllocator allocator(kSize, kSize * 32, ::malloc, ::free); runArena(state, allocator); } template void BM_nl_pool_allocator_arena(benchmark::State& state) { dispenso::NoLockPoolAllocator allocator(kSize, kSize * 32, ::malloc, ::free); runArena(state, allocator); } template void runThreaded(benchmark::State& state, Alloc alloc, Free dealloc) { dispenso::resizeGlobalThreadPool(kThreads); std::vector ptrsArray[kThreads]; for (auto& ptrs : ptrsArray) { ptrs.resize(state.range(0)); } for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(dispenso::globalThreadPool()); for (size_t i = 0; i < kThreads; ++i) { tasks.schedule([alloc, dealloc, &ptrs = ptrsArray[i]]() { for (char*& p : ptrs) { p = alloc(); } for (char* p : ptrs) { dealloc(p); } }); } } } template void BM_mallocfree_threaded(benchmark::State& state) { runThreaded( state, []() { return reinterpret_cast(::malloc(kSize)); }, [](char* buf) { ::free(buf); }); } template void BM_pool_allocator_threaded(benchmark::State& state) { dispenso::PoolAllocator allocator(kSize, (1 << 20), ::malloc, ::free); runThreaded( state, [&allocator]() { return allocator.alloc(); }, [&allocator](char* buf) { allocator.dealloc(buf); }); } BENCHMARK_TEMPLATE(BM_mallocfree, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_pool_allocator, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_nl_pool_allocator, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_mallocfree, kMediumSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_pool_allocator, kMediumSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_nl_pool_allocator, kMediumSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_mallocfree, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_pool_allocator, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_nl_pool_allocator, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_pool_allocator_arena, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_nl_pool_allocator_arena, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_pool_allocator_arena, kMediumSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_nl_pool_allocator_arena, kMediumSize)->Range(1 << 13, 1 << 15); 
BENCHMARK_TEMPLATE(BM_pool_allocator_arena, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_nl_pool_allocator_arena, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kSmallSize, 2)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kSmallSize, 2)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kMediumSize, 2)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kMediumSize, 2)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kLargeSize, 2)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kLargeSize, 2)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kSmallSize, 8)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kSmallSize, 8)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kMediumSize, 8)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kMediumSize, 8)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kLargeSize, 8)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kLargeSize, 8)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kSmallSize, 16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kSmallSize, 16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kMediumSize, 16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kMediumSize, 16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_mallocfree_threaded, kLargeSize, 16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE2(BM_pool_allocator_threaded, kLargeSize, 16)->Range(1 << 13, 1 << 15); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/run_benchmarks.py ================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Benchmark runner and chart generator for dispenso. This script runs the dispenso benchmarks, collects results, and generates comparison charts. Results include machine information for reproducibility. 
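A typical invocation (see the Usage summary below; flags as defined in main()) looks like:

    python run_benchmarks.py --build-dir build --benchmarks "for_benchmark" --output-dir benchmark_results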
Usage: python run_benchmarks.py [--output-dir OUTPUT] [--benchmarks PATTERN] Requirements: pip install matplotlib pandas """ import argparse import json import os import platform import re import subprocess import sys from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional try: import matplotlib.pyplot as plt import pandas as pd HAS_PLOTTING = True except ImportError: HAS_PLOTTING = False def get_machine_info() -> Dict[str, Any]: """Gather machine information for benchmark context.""" info = { "timestamp": datetime.now().isoformat(), "platform": platform.system(), "platform_release": platform.release(), "platform_version": platform.version(), "architecture": platform.machine(), "processor": platform.processor(), "python_version": platform.python_version(), } # Try to get more detailed CPU info if platform.system() == "Linux": try: with open("/proc/cpuinfo", "r") as f: cpuinfo = f.read() # Extract model name match = re.search(r"model name\s*:\s*(.+)", cpuinfo) if match: info["cpu_model"] = match.group(1).strip() # Count cores info["cpu_cores"] = cpuinfo.count("processor\t:") # Get cache info match = re.search(r"cache size\s*:\s*(.+)", cpuinfo) if match: info["cache_size"] = match.group(1).strip() except Exception as e: info["cpu_info_error"] = str(e) # Memory info try: with open("/proc/meminfo", "r") as f: meminfo = f.read() match = re.search(r"MemTotal:\s*(\d+)", meminfo) if match: info["memory_kb"] = int(match.group(1)) info["memory_gb"] = round(int(match.group(1)) / 1024 / 1024, 1) except Exception: pass elif platform.system() == "Darwin": # macOS try: result = subprocess.run( ["sysctl", "-n", "machdep.cpu.brand_string"], capture_output=True, text=True, ) if result.returncode == 0: info["cpu_model"] = result.stdout.strip() result = subprocess.run( ["sysctl", "-n", "hw.ncpu"], capture_output=True, text=True ) if result.returncode == 0: info["cpu_cores"] = int(result.stdout.strip()) result = subprocess.run( ["sysctl", "-n", "hw.memsize"], capture_output=True, text=True ) if result.returncode == 0: mem_bytes = int(result.stdout.strip()) info["memory_gb"] = round(mem_bytes / 1024 / 1024 / 1024, 1) except Exception as e: info["cpu_info_error"] = str(e) elif platform.system() == "Windows": try: import winreg key = winreg.OpenKey( winreg.HKEY_LOCAL_MACHINE, r"HARDWARE\DESCRIPTION\System\CentralProcessor\0", ) info["cpu_model"] = winreg.QueryValueEx(key, "ProcessorNameString")[0] winreg.CloseKey(key) except Exception: pass try: result = subprocess.run( ["wmic", "computersystem", "get", "totalphysicalmemory"], capture_output=True, text=True, ) lines = result.stdout.strip().split("\n") if len(lines) > 1: mem_bytes = int(lines[1].strip()) info["memory_gb"] = round(mem_bytes / 1024 / 1024 / 1024, 1) except Exception: pass info["cpu_cores"] = os.cpu_count() return info def find_benchmarks(build_dir: Path, pattern: Optional[str] = None) -> List[Path]: """Find benchmark executables in the build directory.""" benchmarks = [] # Common locations search_paths = [ build_dir / "bin", build_dir / "benchmarks", build_dir, ] for search_path in search_paths: if not search_path.exists(): continue for f in search_path.iterdir(): if not f.is_file(): continue name = f.name if "benchmark" in name.lower(): if pattern is None or re.search(pattern, name): benchmarks.append(f) return sorted(benchmarks) def run_benchmark(benchmark_path: Path, extra_args: List[str] = None) -> Dict[str, Any]: """Run a single benchmark and return results.""" args = [str(benchmark_path), 
"--benchmark_format=json"] if extra_args: args.extend(extra_args) print(f"Running: {benchmark_path.name}...") try: result = subprocess.run( args, capture_output=True, text=True, timeout=600, # 10 minute timeout per benchmark ) if result.returncode != 0: return { "name": benchmark_path.name, "success": False, "error": result.stderr, } # Parse JSON output # google benchmark outputs JSON to stdout try: data = json.loads(result.stdout) return { "name": benchmark_path.name, "success": True, "data": data, } except json.JSONDecodeError: # Maybe it's not a google benchmark, try to parse stdout return { "name": benchmark_path.name, "success": True, "raw_output": result.stdout, } except subprocess.TimeoutExpired: return { "name": benchmark_path.name, "success": False, "error": "Timeout after 600 seconds", } except Exception as e: return { "name": benchmark_path.name, "success": False, "error": str(e), } def extract_benchmark_data(results: List[Dict]) -> pd.DataFrame: """Extract benchmark data into a DataFrame for plotting.""" rows = [] for result in results: if not result.get("success") or "data" not in result: continue data = result["data"] benchmarks = data.get("benchmarks", []) for bm in benchmarks: row = { "suite": result["name"].replace("_benchmark", ""), "name": bm.get("name", "unknown"), "real_time": bm.get("real_time", 0), "cpu_time": bm.get("cpu_time", 0), "time_unit": bm.get("time_unit", "ns"), "iterations": bm.get("iterations", 0), } # Extract any custom counters for key, value in bm.items(): if key not in row and isinstance(value, (int, float)): row[key] = value rows.append(row) return pd.DataFrame(rows) def generate_charts(df: pd.DataFrame, output_dir: Path): """Generate comparison charts from benchmark data.""" if df.empty: print("No data to plot") return output_dir.mkdir(parents=True, exist_ok=True) # Group by suite for suite, suite_df in df.groupby("suite"): fig, ax = plt.subplots(figsize=(12, 6)) # Bar chart of real_time suite_df_sorted = suite_df.sort_values("real_time") bars = ax.barh(suite_df_sorted["name"], suite_df_sorted["real_time"]) ax.set_xlabel(f"Time ({suite_df_sorted['time_unit'].iloc[0]})") ax.set_title(f"{suite} Benchmark Results") ax.grid(axis="x", alpha=0.3) # Add value labels for bar, val in zip(bars, suite_df_sorted["real_time"]): ax.text( val, bar.get_y() + bar.get_height() / 2, f" {val:.2f}", va="center", fontsize=8, ) plt.tight_layout() chart_path = output_dir / f"{suite}_chart.png" plt.savefig(chart_path, dpi=150) plt.close() print(f"Generated: {chart_path}") # Generate summary chart if multiple suites if df["suite"].nunique() > 1: # This would need custom logic based on what comparisons make sense pass def generate_markdown_report( machine_info: Dict[str, Any], results: List[Dict], df: pd.DataFrame, output_dir: Path, ): """Generate a markdown report of benchmark results.""" report_path = output_dir / "benchmark_report.md" with open(report_path, "w") as f: f.write("# Dispenso Benchmark Results\n\n") # Machine info f.write("## Machine Information\n\n") f.write(f"- **Date**: {machine_info.get('timestamp', 'unknown')}\n") f.write( f"- **Platform**: {machine_info.get('platform', 'unknown')} " f"{machine_info.get('platform_release', '')}\n" ) f.write(f"- **CPU**: {machine_info.get('cpu_model', 'unknown')}\n") f.write(f"- **Cores**: {machine_info.get('cpu_cores', 'unknown')}\n") f.write(f"- **Memory**: {machine_info.get('memory_gb', 'unknown')} GB\n") f.write("\n") # Results summary f.write("## Results Summary\n\n") successful = sum(1 for r in results if 
r.get("success")) f.write(f"- **Benchmarks run**: {len(results)}\n") f.write(f"- **Successful**: {successful}\n") f.write(f"- **Failed**: {len(results) - successful}\n\n") # Per-suite results if not df.empty: for suite in df["suite"].unique(): f.write(f"### {suite}\n\n") suite_df = df[df["suite"] == suite].sort_values("real_time") f.write("| Benchmark | Time | Unit | Iterations |\n") f.write("|-----------|------|------|------------|\n") for _, row in suite_df.iterrows(): f.write( f"| {row['name']} | {row['real_time']:.2f} | " f"{row['time_unit']} | {row['iterations']} |\n" ) f.write("\n") # Link to chart if it exists chart_path = f"{suite}_chart.png" f.write(f"![{suite} results]({chart_path})\n\n") # Failures failures = [r for r in results if not r.get("success")] if failures: f.write("## Failures\n\n") for fail in failures: f.write(f"### {fail['name']}\n\n") f.write(f"```\n{fail.get('error', 'Unknown error')}\n```\n\n") print(f"Generated report: {report_path}") def main(): parser = argparse.ArgumentParser(description="Run dispenso benchmarks") parser.add_argument( "--build-dir", "-b", type=Path, default=Path("build"), help="Build directory containing benchmark executables", ) parser.add_argument( "--output-dir", "-o", type=Path, default=Path("benchmark_results"), help="Output directory for results and charts", ) parser.add_argument( "--benchmarks", "-B", type=str, default=None, help="Regex pattern to filter benchmarks", ) parser.add_argument( "--json-only", action="store_true", help="Only output JSON, skip chart generation", ) args = parser.parse_args() # Gather machine info print("Gathering machine information...") machine_info = get_machine_info() print(f" CPU: {machine_info.get('cpu_model', 'unknown')}") print(f" Cores: {machine_info.get('cpu_cores', 'unknown')}") print(f" Memory: {machine_info.get('memory_gb', 'unknown')} GB") print() # Find benchmarks benchmarks = find_benchmarks(args.build_dir, args.benchmarks) if not benchmarks: print(f"No benchmarks found in {args.build_dir}") print("Make sure you've built with -DDISPENSO_BUILD_BENCHMARKS=ON") sys.exit(1) print(f"Found {len(benchmarks)} benchmark(s):") for b in benchmarks: print(f" - {b.name}") print() # Run benchmarks results = [] for benchmark in benchmarks: result = run_benchmark(benchmark) results.append(result) if result["success"]: print(f" ✓ {benchmark.name}") else: print(f" ✗ {benchmark.name}: {result.get('error', 'unknown error')[:50]}") # Save raw results args.output_dir.mkdir(parents=True, exist_ok=True) output_data = { "machine_info": machine_info, "results": results, } json_path = args.output_dir / "benchmark_results.json" with open(json_path, "w") as f: json.dump(output_data, f, indent=2, default=str) print(f"\nSaved raw results to: {json_path}") # Generate charts and report if not args.json_only and HAS_PLOTTING: df = extract_benchmark_data(results) if not df.empty: generate_charts(df, args.output_dir) generate_markdown_report(machine_info, results, df, args.output_dir) else: print( "No benchmark data to plot (benchmarks may not use google benchmark format)" ) elif not args.json_only and not HAS_PLOTTING: print("Warning: matplotlib/pandas not available. Charts will not be generated.") print("Install with: pip install matplotlib pandas") if __name__ == "__main__": main() ================================================ FILE: benchmarks/rw_lock_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ // This benchmark relies on shared_lock from C++17 #if __cplusplus >= 201703L #include #include #include #include #include "thread_benchmark_common.h" constexpr size_t kNumValues = 1 << 20; // Precondition: Start < writePeriod. Note that this is enforced in BM_serial and BM_parallel template int64_t iterate(MtxType& mtx, std::vector& values, int start, int writePeriod) { int64_t total = 0; int w = start; for (auto& p : values) { if (w++ == writePeriod) { std::lock_guard lk(mtx); ++p; w = 0; } else { std::shared_lock lk(mtx); total += p; } } return total; } struct NopMutex { void lock() {} void unlock() {} void lock_shared() {} void unlock_shared() {} }; template void BM_serial(benchmark::State& state) { int writePeriod = state.range(0); std::vector values(kNumValues); int64_t total = 0; MutexT mtx; int start = 0; for (auto UNUSED_VAR : state) { total += iterate(mtx, values, start++, writePeriod); if (start == writePeriod) { start = 0; } } benchmark::DoNotOptimize(total); } static void CustomArgumentsSerial(benchmark::internal::Benchmark* b) { for (int j : {2, 8, 32, 128, 512}) { b->Args({j}); } } template void BM_parallel(benchmark::State& state) { int concurrency = state.range(0); int writePeriod = state.range(1); std::vector values(kNumValues); std::atomic total(0); MutexT mtx; int start = 0; dispenso::TaskSet tasks(dispenso::globalThreadPool()); for (auto UNUSED_VAR : state) { for (int c = 0; c < concurrency; ++c) { tasks.schedule([&total, start, &mtx, &values, writePeriod]() { total.fetch_add(iterate(mtx, values, start, writePeriod), std::memory_order_acq_rel); }); if (++start == writePeriod) { start = 0; } } tasks.wait(); } benchmark::DoNotOptimize(total.load(std::memory_order_acquire)); } static void CustomArgumentsParallel(benchmark::internal::Benchmark* b) { for (int j : {2, 8, 32, 128, 512}) { for (int s : {1, 2, 4, 8, 16, 32}) { if (s > static_cast(std::thread::hardware_concurrency())) { break; } b->Args({s, j}); } } } BENCHMARK_TEMPLATE(BM_serial, NopMutex)->Apply(CustomArgumentsSerial)->UseRealTime(); BENCHMARK_TEMPLATE(BM_serial, std::shared_mutex)->Apply(CustomArgumentsSerial)->UseRealTime(); BENCHMARK_TEMPLATE(BM_serial, dispenso::RWLock)->Apply(CustomArgumentsSerial)->UseRealTime(); BENCHMARK_TEMPLATE(BM_parallel, std::shared_mutex)->Apply(CustomArgumentsParallel)->UseRealTime(); BENCHMARK_TEMPLATE(BM_parallel, dispenso::RWLock)->Apply(CustomArgumentsParallel)->UseRealTime(); #endif // C++17 ================================================ FILE: benchmarks/simple_for_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #if defined(_OPENMP) #include #endif #include #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_for.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include #if TF_VERSION > 300000 #include #endif // TF_VERSION #include "thread_benchmark_common.h" static uint32_t kSeed(8); static constexpr int kSmallSize = 1000; static constexpr int kMediumSize = 1000000; static constexpr int kLargeSize = 100000000; static constexpr uint32_t kMinSizePerChunk = 10000; const std::vector& getInputs(int num_elements) { static std::unordered_map> vecs; auto it = vecs.find(num_elements); if (it != vecs.end()) { return it->second; } // No need to use a high-quality rng for this test. srand(kSeed); std::vector values; values.reserve(num_elements); for (int i = 0; i < num_elements; ++i) { values.push_back((rand() & 255) - 127); } auto res = vecs.emplace(num_elements, std::move(values)); assert(res.second); return res.first->second; } template void BM_serial(benchmark::State& state) { std::vector output(num_elements, 0); auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { for (size_t i = 0; i < num_elements; ++i) { output[i] = input[i] * input[i] - 3 * input[i]; } } } void checkResults(const std::vector& input, const std::vector& output) { for (size_t i = 0; i < input.size(); ++i) { if (output[i] != input[i] * input[i] - 3 * input[i]) { std::cerr << "FAIL! " << output[i] << " vs " << input[i] * input[i] - 3 * input[i] << std::endl; abort(); } } } void BM_dispenso(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); std::vector output(num_elements, 0); dispenso::ThreadPool pool(num_threads); dispenso::ParForOptions options; options.minItemsPerChunk = kMinSizePerChunk; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); dispenso::parallel_for( tasks, 0, num_elements, [&input, &output](size_t i) { output[i] = input[i] * input[i] - 3 * input[i]; }, options); } checkResults(input, output); } void BM_taskflow(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); std::vector output(num_elements, 0); auto& input = getInputs(num_elements); tf::Executor executor(num_threads); tf::Taskflow taskflow; for (auto UNUSED_VAR : state) { taskflow.for_each_index(0, num_elements, 1, [&input, &output](int i) { output[i] = input[i] * input[i] - 3 * input[i]; }); executor.run(taskflow).wait(); } checkResults(input, output); } void BM_dispenso_static_chunk(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); std::vector output(num_elements, 0); dispenso::ThreadPool pool(num_threads); dispenso::ParForOptions options; options.minItemsPerChunk = kMinSizePerChunk; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); auto range = dispenso::makeChunkedRange(0, num_elements, dispenso::ParForChunking::kStatic); dispenso::parallel_for( tasks, range, [&input, &output](size_t begin, size_t end) { for (size_t i = begin; i < end; ++i) { output[i] = input[i] * input[i] - 3 * input[i]; } }, options); } checkResults(input, output); } void BM_dispenso_auto_chunk(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); std::vector output(num_elements, 0); dispenso::ThreadPool pool(num_threads); dispenso::ParForOptions options; options.minItemsPerChunk = 
kMinSizePerChunk; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); auto range = dispenso::makeChunkedRange(0, num_elements, dispenso::ParForChunking::kAuto); dispenso::parallel_for( tasks, range, [&input, &output](size_t begin, size_t end) { for (size_t i = begin; i < end; ++i) { output[i] = input[i] * input[i] - 3 * input[i]; } }, options); } checkResults(input, output); } #if defined(_OPENMP) void BM_omp(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); std::vector output(num_elements, 0); omp_set_num_threads(num_threads); auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { #pragma omp parallel for for (int i = 0; i < num_elements; ++i) { output[i] = input[i] * input[i] - 3 * input[i]; } } checkResults(input, output); } #endif /*defined(_OPENMP)*/ #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); std::vector output(num_elements, 0); auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(num_threads); tbb::parallel_for( tbb::blocked_range(0, num_elements), [&input, &output](const tbb::blocked_range& r) { for (size_t i = r.begin(); i < r.end(); ++i) { output[i] = input[i] * input[i] - 3 * input[i]; } }); } checkResults(input, output); } #endif // !BENCHMARK_WITHOUT_TBB static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } BENCHMARK_TEMPLATE(BM_serial, kSmallSize); BENCHMARK_TEMPLATE(BM_serial, kMediumSize); BENCHMARK_TEMPLATE(BM_serial, kLargeSize); #if defined(_OPENMP) BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif // OPENMP #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_taskflow)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_static_chunk)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_auto_chunk)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/simple_pool_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * Basic thread pool benchmarks comparing dispenso, TBB, and Folly. * Tests simple task scheduling throughput. 
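 *
 * A minimal sketch of the dispenso pattern being timed, assuming the TaskSet
 * API used by the benchmarks below (illustrative only):
 *
 *   dispenso::ThreadPool pool(numThreads);
 *   dispenso::TaskSet tasks(pool);
 *   for (int i = 0; i < numTasks; ++i) {
 *     tasks.schedule([i]() { consume(i); });  // consume() is a placeholder
 *   }
 *   // The TaskSet waits for outstanding tasks when it goes out of scope.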
*/ #include #if !defined(BENCHMARK_WITHOUT_TBB) #include #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #if !defined(BENCHMARK_WITHOUT_FOLLY) #include #include #endif // !BENCHMARK_WITHOUT_FOLLY #include "thread_benchmark_common.h" static constexpr int kSmallSize = 1000; static constexpr int kMediumSize = 10000; static constexpr int kLargeSize = 1000000; struct alignas(64) Work { size_t count = 0; void operator+=(size_t o) { count += o; } }; Work g_work[1025]; std::atomic g_tCounter{0}; inline int testTid() { static DISPENSO_THREAD_LOCAL int t = -1; if (t < 0) { t = g_tCounter.fetch_add(1, std::memory_order_acq_rel); } return t; } inline Work& work() { static DISPENSO_THREAD_LOCAL Work* w = nullptr; if (!w) { if (testTid() == 0) { w = g_work + 1024; } else { w = g_work + (testTid() & 1023); } } return *w; } void BM_dispenso(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); for (int i = 0; i < num_elements; ++i) { tasks.schedule([i]() { work() += i; }); } } } void BM_dispenso_bulk(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); tasks.scheduleBulk( static_cast(num_elements), [](size_t i) { return [i]() { work() += i; }; }); } } #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); tbb_compat::task_scheduler_init initsched(num_threads); for (auto UNUSED_VAR : state) { tbb::task_group g; for (int i = 0; i < num_elements; ++i) { g.run([i]() { work() += i; }); } g.wait(); } } #endif // !BENCHMARK_WITHOUT_TBB #if !defined(BENCHMARK_WITHOUT_FOLLY) void BM_folly(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); folly::CPUThreadPoolExecutor follyExec(num_threads); for (auto UNUSED_VAR : state) { folly::VirtualExecutor tasks(&follyExec); for (int i = 0; i < num_elements; ++i) { tasks.add([i]() { work() += i; }); } } } #endif // !BENCHMARK_WITHOUT_FOLLY static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int s : pow2HalfStepThreads()) { b->Args({s, j}); } } } #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB #if !defined(BENCHMARK_WITHOUT_FOLLY) BENCHMARK(BM_folly)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_FOLLY BENCHMARK(BM_dispenso)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); BENCHMARK(BM_dispenso_bulk)->Apply(CustomArguments)->Unit(benchmark::kMicrosecond)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/small_buffer_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include "benchmark_common.h" constexpr size_t kSmallSize = 32; constexpr size_t kMediumSize = 128; constexpr size_t kLargeSize = 256; template void run(benchmark::State& state, Alloc alloc, Free dealloc) { std::vector ptrs(state.range(0)); for (auto UNUSED_VAR : state) { for (char*& p : ptrs) { p = alloc(); } for (char* p : ptrs) { dealloc(p); } } } template void BM_newdelete(benchmark::State& state) { run(state, []() { return new char[kSize]; }, [](char* buf) { delete[] (buf); }); } template void BM_small_buffer_allocator(benchmark::State& state) { run( state, []() { return dispenso::allocSmallBuffer(); }, [](char* buf) { dispenso::deallocSmallBuffer(buf); }); } BENCHMARK_TEMPLATE(BM_newdelete, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_small_buffer_allocator, kSmallSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_newdelete, kMediumSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_small_buffer_allocator, kMediumSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_newdelete, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_small_buffer_allocator, kLargeSize)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_newdelete, kSmallSize)->Threads(16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_small_buffer_allocator, kSmallSize)->Threads(16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_newdelete, kMediumSize)->Threads(16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_small_buffer_allocator, kMediumSize)->Threads(16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_newdelete, kLargeSize)->Threads(16)->Range(1 << 13, 1 << 15); BENCHMARK_TEMPLATE(BM_small_buffer_allocator, kLargeSize)->Threads(16)->Range(1 << 13, 1 << 15); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/summing_for_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #if defined(_OPENMP) #include #endif #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_reduce.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" static uint32_t kSeed(8); static constexpr int kSmallSize = 1000; static constexpr int kMediumSize = 1000000; static constexpr int kLargeSize = 100000000; const std::vector& getInputs(int num_elements) { static std::unordered_map> vecs; auto it = vecs.find(num_elements); if (it != vecs.end()) { return it->second; } // No need to use a high-quality rng for this test. srand(kSeed); std::vector values; values.reserve(num_elements); for (int i = 0; i < num_elements; ++i) { values.push_back((rand() & 255) - 127); } auto res = vecs.emplace(num_elements, std::move(values)); assert(res.second); return res.first->second; } void checkResults(const std::vector& inputs, int64_t actual, int foo) { int64_t expected = 0; for (auto v : inputs) { expected += v * v - 3 * foo * v; } if (expected != actual) { std::cerr << "FAIL! 
" << expected << " vs " << actual << std::endl; abort(); } } template void BM_serial(benchmark::State& state) { auto& input = getInputs(num_elements); int64_t sum = 0; int foo = 0; for (auto UNUSED_VAR : state) { sum = 0; ++foo; for (size_t i = 0; i < num_elements; ++i) { sum += input[i] * input[i] - 3 * foo * input[i]; } } checkResults(input, sum, foo); } void BM_dispenso(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); int64_t sum = 0; int foo = 0; dispenso::ParForOptions options; options.minItemsPerChunk = 50000; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); std::vector sums; sums.reserve(num_threads + 1); ++foo; dispenso::parallel_for( tasks, sums, []() { return int64_t{0}; }, dispenso::makeChunkedRange(0, num_elements, dispenso::ParForChunking::kAuto), [&input, foo](int64_t& lsumStore, size_t i, size_t end) { int64_t lsum = 0; for (; i != end; ++i) { lsum += input[i] * input[i] - 3 * foo * input[i]; } lsumStore += lsum; }, options); sum = 0; for (auto s : sums) { sum += s; } } checkResults(input, sum, foo); } #if defined(_OPENMP) void BM_omp(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); omp_set_num_threads(num_threads); int64_t sum = 0; int foo = 0; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { sum = 0; ++foo; #pragma omp parallel for reduction(+ : sum) for (int i = 0; i < num_elements; ++i) { sum += input[i] * input[i] - 3 * foo * input[i]; } } checkResults(input, sum, foo); } #endif /*defined(_OPENMP)*/ #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); int64_t sum = 0; int foo = 0; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(num_threads); ++foo; sum = tbb::parallel_reduce( tbb::blocked_range(&input[0], &input[0] + num_elements), int64_t{0}, [foo](const tbb::blocked_range& r, int64_t init) -> int64_t { for (const int* a = r.begin(); a != r.end(); ++a) init += *a * *a - 3 * foo * *a; return init; }, [](int64_t x, int64_t y) -> int64_t { return x + y; }); } checkResults(input, sum, foo); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_static(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); int64_t sum = 0; int foo = 0; dispenso::ParForOptions options; options.minItemsPerChunk = 50000; auto& input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); std::vector sums; sums.reserve(num_threads + 1); ++foo; dispenso::parallel_for( tasks, sums, []() { return int64_t{0}; }, dispenso::makeChunkedRange(0, num_elements, dispenso::ParForChunking::kStatic), [&input, foo](int64_t& lsumStore, size_t i, size_t end) { int64_t lsum = 0; for (; i != end; ++i) { lsum += input[i] * input[i] - 3 * foo * input[i]; } lsumStore += lsum; }, options); sum = 0; for (auto s : sums) { sum += s; } } checkResults(input, sum, foo); } static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } BENCHMARK_TEMPLATE(BM_serial, kSmallSize); BENCHMARK_TEMPLATE(BM_serial, kMediumSize); BENCHMARK_TEMPLATE(BM_serial, kLargeSize); #if defined(_OPENMP) 
BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif // OPENMP #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_static)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: benchmarks/tbb_compat.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once /** * TBB compatibility header for supporting both legacy TBB and oneTBB (2021+). * * oneTBB 2021+ removed task_scheduler_init in favor of global_control and task_arena. * oneTBB also changed pipeline.h to parallel_pipeline.h and renamed filter types. * This header provides a unified interface that works with both versions. */ #if !defined(BENCHMARK_WITHOUT_TBB) // Include version header to detect TBB version #if __has_include() #include #elif __has_include() #include #endif // Detect oneTBB (2021+) vs legacy TBB #if defined(TBB_VERSION_MAJOR) && TBB_VERSION_MAJOR >= 2021 #define TBB_USE_ONETBB 1 #else #define TBB_USE_ONETBB 0 #endif #if TBB_USE_ONETBB // oneTBB 2021+ uses global_control instead of task_scheduler_init #include #include #include #include namespace tbb_compat { // RAII wrapper for thread count control, compatible with old task_scheduler_init API class task_scheduler_init { public: explicit task_scheduler_init(int num_threads) : control_(tbb::global_control::max_allowed_parallelism, num_threads) {} private: tbb::global_control control_; }; // Pipeline filter mode compatibility // In oneTBB, tbb::filter::serial -> tbb::filter_mode::serial_in_order // In oneTBB, tbb::filter::parallel -> tbb::filter_mode::parallel namespace filter { constexpr auto serial = tbb::filter_mode::serial_in_order; constexpr auto parallel = tbb::filter_mode::parallel; } // namespace filter } // namespace tbb_compat #else // Legacy TBB (< 2021) #include #include #include namespace tbb_compat { using task_scheduler_init = tbb::task_scheduler_init; // Pipeline filter mode compatibility - just alias the legacy types namespace filter { constexpr auto serial = tbb::filter::serial; constexpr auto parallel = tbb::filter::parallel; } // namespace filter } // namespace tbb_compat #endif // TBB_USE_ONETBB #endif // !BENCHMARK_WITHOUT_TBB ================================================ FILE: benchmarks/thread_benchmark_common.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #ifdef _POSIX_C_SOURCE #include #endif // _POSIX_C_SOURCE #include #include #include #include "benchmark_common.h" inline std::vector pow2HalfStepThreads() { const int kRunningThreads = std::thread::hardware_concurrency(); std::vector result; result.push_back(1); for (int block = 2; block <= kRunningThreads; block *= 2) { int step = block / 2; for (int i = block; i < 2 * block && i <= kRunningThreads; i += step) { result.push_back(i); } } return result; } #if defined(_POSIX_C_SOURCE) || defined(__MACH__) struct rusage g_rusage; inline void startRusage() { std::atomic_thread_fence(std::memory_order_acquire); getrusage(RUSAGE_SELF, &g_rusage); std::atomic_thread_fence(std::memory_order_release); } inline double duration(struct timeval start, struct timeval end) { return (end.tv_sec + 1e-6 * end.tv_usec) - (start.tv_sec + 1e-6 * start.tv_usec); } inline void endRusage(benchmark::State& state) { std::atomic_thread_fence(std::memory_order_acquire); struct rusage res; getrusage(RUSAGE_SELF, &res); std::atomic_thread_fence(std::memory_order_release); double userTime = duration(g_rusage.ru_utime, res.ru_utime); double sysTime = duration(g_rusage.ru_stime, res.ru_stime); state.counters["\t0 User"] = userTime; state.counters["\t1 System"] = sysTime; } #else inline void startRusage() {} inline void endRusage(benchmark::State& state) {} #endif //_POSIX_C_SOURCE inline double getMean(const std::vector& data) { double sum = 0.0; for (auto d : data) { sum += d; } return sum / data.size(); } inline double getStddev(double mean, const std::vector& data) { double sumsq = 0.0; for (auto d : data) { auto dev = mean - d; sumsq += dev * dev; } return std::sqrt(sumsq / data.size()); } void doStats(const std::vector& times, benchmark::State& state) { double mean = getMean(times); state.counters["mean"] = mean; state.counters["stddev"] = getStddev(mean, times); } ================================================ FILE: benchmarks/timed_task_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include "thread_benchmark_common.h" #if !defined(BENCHMARK_WITHOUT_FOLLY) #include #include #endif // !BENCHMARK_WITHOUT_FOLLY size_t getIterations() { // If we want to get a sense of system overhead via getrusage, we need to boost things into the // milliseconds range to get any kind of reasonable values. Typically on Linux I'm seeing that // Folly uses slightly more resources (system + user) on average, but that dispenso uses more in // the worst cases. The very worst case is still less than 1% of 1 cpu (~8 seconds active, // 50ms of system+user time). 
static const bool itersEnv = getenv("TEST_WITH_MANY_ITERS") != nullptr; if (itersEnv) { return 2000; } else { return 200; } } void absTimesToErrors(std::vector& times, double prevTime, double expectedDelta) { for (size_t i = 0; i < times.size(); ++i) { auto temp = times[i]; times[i] -= prevTime; times[i] -= expectedDelta; times[i] = std::abs(times[i]); prevTime = temp; } } void absTimesToSteadyErrors(std::vector& times, double prevTime, double expectedDelta) { for (size_t i = 0; i < times.size(); ++i) { times[i] -= prevTime; times[i] -= expectedDelta; times[i] = std::abs(times[i]); prevTime += expectedDelta; } } #if !defined(BENCHMARK_WITHOUT_FOLLY) template void BM_folly(benchmark::State& state) { std::vector times(getIterations()); std::atomic count(0); startRusage(); folly::FunctionScheduler fs; if (kSteady) { fs.setSteady(true); } folly::Baton fin; fs.addFunction( [&] { auto cur = count.fetch_add(1, std::memory_order_acq_rel); if (cur < times.size()) { times[cur] = dispenso::getTime(); } else if (cur == times.size()) { fin.post(); } }, std::chrono::milliseconds(kInMs), "add"); double prevTime = dispenso::getTime(); fs.start(); fin.wait(); fs.shutdown(); for (auto UNUSED_VAR : state) { // dummy iteration state.SetIterationTime(0.1); } if (kSteady) { // We need to adjust prevTime because for steady scheduling, folly tries to schedule right away. absTimesToSteadyErrors(times, prevTime - kInMs * 1e-3, kInMs * 1e-3); } else { absTimesToErrors(times, prevTime, kInMs * 1e-3); } endRusage(state); doStats(times, state); } struct FollyItem { std::vector times; std::atomic count{0}; size_t millis; folly::Baton fin; FollyItem() : times(getIterations()) {} }; template void BM_folly_mixed(benchmark::State& state) { FollyItem items[3]; items[0].millis = 1; items[1].millis = 4; items[2].millis = 3; startRusage(); folly::FunctionScheduler fs; if (kSteady) { fs.setSteady(true); } auto doSchedule = [&fs](FollyItem& fitem, const char* name) { fs.addFunction( [&] { auto cur = fitem.count.fetch_add(1, std::memory_order_acq_rel); if (cur < fitem.times.size()) { fitem.times[cur] = dispenso::getTime(); } else if (cur == fitem.times.size()) { fitem.fin.post(); } }, std::chrono::milliseconds(fitem.millis), name); }; size_t i = 0; for (auto name : {"a", "b", "c"}) { doSchedule(items[i++], name); } double prevTime = dispenso::getTime(); fs.start(); for (auto& item : items) { item.fin.wait(); } fs.shutdown(); for (auto UNUSED_VAR : state) { // dummy iteration state.SetIterationTime(0.1); } if (kSteady) { // We need to adjust prevTime because for steady scheduling, folly tries to schedule right away. 
for (auto& item : items) { absTimesToSteadyErrors(item.times, prevTime - item.millis * 1e-3, item.millis * 1e-3); } } else { for (auto& item : items) { absTimesToErrors(item.times, prevTime, item.millis * 1e-3); } } endRusage(state); // append all times items[0].times.insert(items[0].times.end(), items[1].times.begin(), items[1].times.end()); items[0].times.insert(items[0].times.end(), items[2].times.begin(), items[2].times.end()); doStats(items[0].times, state); } #endif dispenso::TimedTaskScheduler& getScheduler() { static const bool rtEnv = getenv("TEST_WITH_REALTIME") != nullptr; if (rtEnv) { static dispenso::TimedTaskScheduler sched(dispenso::ThreadPriority::kRealtime); return sched; } else { return dispenso::globalTimedTaskScheduler(); } } template void BM_dispenso(benchmark::State& state) { std::vector times(getIterations()); std::atomic count(0); double period = 1e-3 * kInMs; startRusage(); dispenso::CompletionEvent fin; double prevTime = dispenso::getTime(); auto type = kSteady ? dispenso::TimedTaskType::kSteady : dispenso::TimedTaskType::kNormal; auto task = getScheduler().schedule( dispenso::kImmediateInvoker, [&] { auto cur = count.fetch_add(1, std::memory_order_acq_rel); if (cur < times.size()) { times[cur] = dispenso::getTime(); } if (cur + 1 == times.size()) { fin.notify(); return false; } return true; }, prevTime + period, period, times.size(), type); fin.wait(); for (auto UNUSED_VAR : state) { // dummy iteration state.SetIterationTime(0.1); } if (kSteady) { absTimesToSteadyErrors(times, prevTime, kInMs * 1e-3); } else { absTimesToErrors(times, prevTime, kInMs * 1e-3); } endRusage(state); doStats(times, state); } struct DispensoItem { std::vector times; std::atomic count{0}; size_t millis; dispenso::CompletionEvent fin; DispensoItem() : times(getIterations()) {} }; template void BM_dispenso_mixed(benchmark::State& state) { DispensoItem items[3]; std::deque tasks; items[0].millis = 1; items[1].millis = 4; items[2].millis = 3; startRusage(); double prevTime = dispenso::getTime(); auto type = kSteady ? 
dispenso::TimedTaskType::kSteady : dispenso::TimedTaskType::kNormal; auto doSchedule = [&](DispensoItem& ditem) { double period = 1e-3 * ditem.millis; tasks.push_back( getScheduler().schedule( dispenso::kImmediateInvoker, [&] { auto cur = ditem.count.fetch_add(1, std::memory_order_acq_rel); if (cur < ditem.times.size()) { ditem.times[cur] = dispenso::getTime(); } if (cur + 1 == ditem.times.size()) { ditem.fin.notify(); return false; } return true; }, prevTime + period, period, ditem.times.size(), type)); }; for (size_t i = 0; i < 3; ++i) { doSchedule(items[i]); } for (auto& item : items) { item.fin.wait(); } for (auto UNUSED_VAR : state) { // dummy iteration state.SetIterationTime(0.1); } if (kSteady) { for (auto& item : items) { absTimesToSteadyErrors(item.times, prevTime, item.millis * 1e-3); } } else { for (auto& item : items) { absTimesToErrors(item.times, prevTime, item.millis * 1e-3); } } endRusage(state); // append all times items[0].times.insert(items[0].times.end(), items[1].times.begin(), items[1].times.end()); items[0].times.insert(items[0].times.end(), items[2].times.begin(), items[2].times.end()); doStats(items[0].times, state); } BENCHMARK_TEMPLATE2(BM_dispenso, 2, false)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_dispenso, 4, false)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_dispenso, 6, false)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_dispenso, 2, true)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_dispenso, 4, true)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_dispenso, 6, true)->UseManualTime(); BENCHMARK_TEMPLATE(BM_dispenso_mixed, false)->UseManualTime(); BENCHMARK_TEMPLATE(BM_dispenso_mixed, true)->UseManualTime(); #if !defined(BENCHMARK_WITHOUT_FOLLY) BENCHMARK_TEMPLATE2(BM_folly, 2, false)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_folly, 4, false)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_folly, 6, false)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_folly, 2, true)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_folly, 4, true)->UseManualTime(); BENCHMARK_TEMPLATE2(BM_folly, 6, true)->UseManualTime(); BENCHMARK_TEMPLATE(BM_folly_mixed, false)->UseManualTime(); BENCHMARK_TEMPLATE(BM_folly_mixed, true)->UseManualTime(); #endif // FOLLY ================================================ FILE: benchmarks/trivial_compute_benchmark.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #if defined(_OPENMP) #include #endif #if !defined(BENCHMARK_WITHOUT_TBB) #include "tbb/blocked_range.h" #include "tbb/parallel_reduce.h" #include "tbb_compat.h" #endif // !BENCHMARK_WITHOUT_TBB #include "thread_benchmark_common.h" static constexpr int kSmallSize = 100; static constexpr int kMediumSize = 1000000; static constexpr int kLargeSize = 100000000; uint32_t getInputs(int num_elements) { srand(num_elements); return rand() & 127; } inline uint64_t calculate(uint64_t input, uint64_t index, size_t foo) { return std::cos( std::log( std::sin(std::exp(std::sqrt(static_cast((input ^ index) - 3 * foo * input)))))); } void checkResults(uint32_t input, uint64_t actual, int foo, size_t num_elements) { if (!foo) return; if (input != getInputs(num_elements)) { std::cerr << "Failed to recover input!" << std::endl; abort(); } uint64_t expected = 0; for (size_t i = 0; i < num_elements; ++i) { expected += calculate(input, i, foo); } if (expected != actual) { std::cerr << "FAIL! 
" << expected << " vs " << actual << std::endl; abort(); } } template void BM_serial(benchmark::State& state) { auto input = getInputs(num_elements); uint64_t sum = 0; int foo = 0; for (auto UNUSED_VAR : state) { sum = 0; ++foo; for (size_t i = 0; i < num_elements; ++i) { sum += calculate(input, i, foo); } } checkResults(input, sum, foo, num_elements); } void BM_dispenso(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); uint64_t sum = 0; int foo = 0; dispenso::ParForOptions options; options.defaultChunking = dispenso::ParForChunking::kAuto; options.minItemsPerChunk = 4000; auto input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); std::vector sums; sums.reserve(num_threads + 1); ++foo; dispenso::parallel_for( tasks, sums, []() { return uint64_t{0}; }, 0, num_elements, [input, foo](uint64_t& lsumStore, int i, int end) { uint64_t lsum = 0; for (; i != end; ++i) { lsum += calculate(input, i, foo); } lsumStore += lsum; }, options); sum = 0; for (auto s : sums) { sum += s; } } checkResults(input, sum, foo, num_elements); } #if defined(_OPENMP) void BM_omp(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); omp_set_num_threads(num_threads); uint64_t sum = 0; int foo = 0; auto input = getInputs(num_elements); for (auto UNUSED_VAR : state) { sum = 0; ++foo; #pragma omp parallel for reduction(+ : sum) for (int i = 0; i < num_elements; ++i) { sum += calculate(input, i, foo); } } checkResults(input, sum, foo, num_elements); } #endif /* defined(_OPENMP)*/ #if !defined(BENCHMARK_WITHOUT_TBB) void BM_tbb(benchmark::State& state) { const int num_threads = state.range(0); const int num_elements = state.range(1); uint64_t sum = 0; int foo = 0; auto input = getInputs(num_elements); for (auto UNUSED_VAR : state) { tbb_compat::task_scheduler_init initsched(num_threads); ++foo; sum = tbb::parallel_reduce( tbb::blocked_range(0, num_elements), uint64_t{0}, [input, foo](const tbb::blocked_range& r, uint64_t init) -> uint64_t { for (size_t a = r.begin(); a != r.end(); ++a) init += calculate(input, a, foo); return init; }, [](uint64_t x, uint64_t y) -> uint64_t { return x + y; }); } checkResults(input, sum, foo, num_elements); } #endif // !BENCHMARK_WITHOUT_TBB void BM_dispenso_static(benchmark::State& state) { const int num_threads = state.range(0) - 1; const int num_elements = state.range(1); dispenso::ThreadPool pool(num_threads); uint64_t sum = 0; int foo = 0; dispenso::ParForOptions options; options.minItemsPerChunk = 4000; auto input = getInputs(num_elements); for (auto UNUSED_VAR : state) { dispenso::TaskSet tasks(pool); std::vector sums; sums.reserve(num_threads + 1); ++foo; dispenso::parallel_for( tasks, sums, []() { return uint64_t{0}; }, dispenso::makeChunkedRange(0, num_elements, dispenso::ParForChunking::kStatic), [input, foo](uint64_t& lsumStore, int i, int end) { uint64_t lsum = 0; for (; i != end; ++i) { lsum += calculate(input, i, foo); } lsumStore += lsum; }, options); sum = 0; for (auto s : sums) { sum += s; } } checkResults(input, sum, foo, num_elements); } static void CustomArguments(benchmark::internal::Benchmark* b) { for (int j : {kSmallSize, kMediumSize, kLargeSize}) { for (int i : pow2HalfStepThreads()) { b->Args({i, j}); } } } BENCHMARK_TEMPLATE(BM_serial, kSmallSize); BENCHMARK_TEMPLATE(BM_serial, kMediumSize); BENCHMARK_TEMPLATE(BM_serial, kLargeSize); #if defined(_OPENMP) 
BENCHMARK(BM_omp)->Apply(CustomArguments)->UseRealTime(); #endif // OPENMP #if !defined(BENCHMARK_WITHOUT_TBB) BENCHMARK(BM_tbb)->Apply(CustomArguments)->UseRealTime(); #endif // !BENCHMARK_WITHOUT_TBB BENCHMARK(BM_dispenso)->Apply(CustomArguments)->UseRealTime(); BENCHMARK(BM_dispenso_static)->Apply(CustomArguments)->UseRealTime(); BENCHMARK_MAIN(); ================================================ FILE: cmake/DispensoConfig.cmake.in ================================================ # # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # @PACKAGE_INIT@ include(CMakeFindDependencyMacro) find_dependency(Threads) if(@DISPENSO_USE_SYSTEM_CONCURRENTQUEUE@) find_dependency(concurrentqueue) endif() include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@_Exports.cmake") check_required_components("@PROJECT_NAME@") ================================================ FILE: codecov.yml ================================================ # Codecov configuration for dispenso # See https://docs.codecov.com/docs/codecovyml-reference coverage: # Overall project coverage settings status: project: default: # Require at least 92% overall coverage target: 92% patch: default: # New code should have at least 80% coverage target: 80% # This is informational, not blocking informational: true # Round coverage to 1 decimal place precision: 1 # Ignore third-party code and test files ignore: - "dispenso/third-party/**" - "tests/**" - "benchmarks/**" # Comment settings for PRs comment: layout: "reach,diff,flags,files" behavior: default require_changes: true ================================================ FILE: dispenso/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.12) file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS *.h) if(NOT DISPENSO_BUILD_FAST_MATH) list(FILTER SOURCES EXCLUDE REGEX "fast_math/") list(FILTER HEADERS EXCLUDE REGEX "fast_math/") endif() message("SOURCES: ${SOURCES}") if(DISPENSO_SHARED_LIB) add_compile_definitions(DISPENSO_SHARED_LIB DISPENSO_LIB_EXPORT) add_library(dispenso SHARED ${SOURCES} ${HEADERS}) target_compile_options(dispenso PRIVATE $<$>:-fvisibility=hidden> ) else() add_library(dispenso STATIC ${SOURCES} ${HEADERS}) endif() target_compile_options(dispenso PRIVATE $<$:/W3 /WX> # GCC false positive: stringop-overflow through deeply inlined atomics. # Seen with GCC 13 on Linux and GCC 14 on macOS ARM64. $<$:-Wno-stringop-overflow> $<$>:-Wall -Wextra -pedantic -Wconversion -Wno-sign-conversion -Werror> ) if(WIN32) target_compile_definitions(dispenso PUBLIC NOMINMAX) endif() target_include_directories(dispenso PUBLIC $ $ $ ) if(DISPENSO_USE_SYSTEM_CONCURRENTQUEUE) find_package(concurrentqueue CONFIG REQUIRED) target_link_libraries(dispenso PUBLIC concurrentqueue::concurrentqueue) else() target_include_directories(dispenso PUBLIC $ $ ) endif() # Export the C++ standard requirement so downstream consumers compile with at # least the same standard dispenso was built with (default: C++14). 
target_compile_features(dispenso PUBLIC cxx_std_${CMAKE_CXX_STANDARD}) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) target_link_libraries(dispenso PUBLIC Threads::Threads) check_cxx_source_compiles(" #include #include std::atomic a(0); std::atomic b(0); std::atomic c(0); std::atomic d(0); int main() { ++a; ++b; ++c; return ++d; } " DISPENSO_HAS_ATOMIC_WITHOUT_LIB) if (NOT DISPENSO_HAS_ATOMIC_WITHOUT_LIB) target_link_libraries(dispenso PUBLIC atomic) endif() if(WIN32) target_link_libraries(dispenso PUBLIC Synchronization Winmm) endif() if (NOT DISPENSO_STANDALONE) return() endif() ## Install library ## set_target_properties(dispenso PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) install(TARGETS dispenso EXPORT ${PROJECT_NAME}_Exports LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} NAMELINK_SKIP # on Windows put the dlls into bin RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # ... and the import lib into the devel package ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) install(EXPORT ${PROJECT_NAME}_Exports DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}-${PROJECT_VERSION} NAMESPACE Dispenso:: ) install(TARGETS dispenso LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} NAMELINK_ONLY RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) ## Install headers ## # Build list of directory patterns to exclude from header installation. set(_dispenso_install_excludes "") if(NOT DISPENSO_BUILD_FAST_MATH) list(APPEND _dispenso_install_excludes PATTERN "fast_math" EXCLUDE) endif() if(DISPENSO_USE_SYSTEM_CONCURRENTQUEUE) list(APPEND _dispenso_install_excludes PATTERN "third-party" EXCLUDE) endif() install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN *.h ${_dispenso_install_excludes} ) ## Generate and install CMake target exports ## include(CMakePackageConfigHelpers) configure_package_config_file( "${PROJECT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}-${PROJECT_VERSION} ) write_basic_package_version_file( "${PROJECT_NAME}ConfigVersion.cmake" VERSION ${PROJECT_VERSION} COMPATIBILITY SameMajorVersion ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}-${PROJECT_VERSION} ) ================================================ FILE: dispenso/async_request.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file async_request.h * @ingroup group_async * A file providing AsyncRequest. This is a bit like a lightweight channel for storing updates to * one object, mostly intended to be used as a single producer, single consumer update mechanism. **/ #pragma once #if __cplusplus >= 201703L #include #else #include #endif // C++17 #include namespace dispenso { /** * A type for making async requests. Although it is safe to use from multiple producers and * consumers, it is primarily intended to be used from single producer, single consumer. * * Typically the consumer will request an update of the value from thread 0, and the producer will * look whether an update was requested from thread 1. 
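 *
 * For illustration, a minimal sketch of that handshake (the thread split, the Config type, and
 * the apply()/buildConfig() helpers are illustrative, not part of the API):
 *
 *   dispenso::AsyncRequest<Config> latest;
 *
 *   // Thread 0 (consumer)
 *   latest.requestUpdate();
 *   // ... later ...
 *   if (auto update = latest.getUpdate()) {
 *     apply(*update);
 *   }
 *
 *   // Thread 1 (producer)
 *   if (latest.updateRequested()) {
 *     latest.tryEmplaceUpdate(buildConfig());
 *   }
 *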
Once the producer determines an update was * requested (updateRequested() returns true), it calls tryEmplaceUpdate() to update the underlying * data. Then when the consumer on thread 0 next calls getUpdate(), an optional wrapper to the * updated data is returned, and the AsyncRequest object is reset (it no longer has valid data, and * no update will have yet been requested for the next update). **/ template class AsyncRequest { public: // A lightweight std::optional-like type with a subset of functionality. #if __cplusplus >= 201703L using OpResult = std::optional; #else using OpResult = detail::OpResult; #endif // C++17 /** * The consumer can call this to request an update to the underlying data. If request has already * been made or fulfilled, this is a no-op. **/ void requestUpdate() { RequestState state = kNone; state_.compare_exchange_strong(state, kNeedsUpdate, std::memory_order_acq_rel); } /** * The producer can check this to determine if an update is needed. * * @return true if an update is required, false otherwise. **/ bool updateRequested() const { return state_.load(std::memory_order_acquire) == kNeedsUpdate; } /** * The producer can try to emplace a new T object in response to a request. * @param args The arguments to emplace. * @return true if the underlying data was updated. false if the underlying data is not in need * of an update. * @note For cases where calling this superflously could be expensive, it is wise to check * updateRequested() first. **/ template bool tryEmplaceUpdate(Args&&... args) { RequestState state = kNeedsUpdate; if (!state_.compare_exchange_strong(state, kUpdating, std::memory_order_acq_rel)) { return false; } obj_.emplace(std::forward(args)...); state_.store(kReady, std::memory_order_release); return true; } /** * The consumer can attempt to get an update. * @return An optional wrapper to the underlying data. If no update is ready, nullopt is * returned. Once an update has been returned, the AsyncRequest object is returned to a state with * no underlying data. **/ OpResult getUpdate() { if (state_.load(std::memory_order_acquire) == kReady) { auto obj = std::move(obj_); state_.store(kNone, std::memory_order_release); return obj; } return {}; } private: enum RequestState { kNone, kNeedsUpdate, kUpdating, kReady }; alignas(kCacheLineSize) std::atomic state_ = {kNone}; OpResult obj_; }; } // namespace dispenso ================================================ FILE: dispenso/completion_event.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file completion_event.h * @ingroup group_async * A file providing a CompletionEvent type, which gives a way to signal to waiting threads that some * event has been completed. **/ #pragma once #include #include namespace dispenso { /** * A class which can be used for one-time notify/wait scenarios. It is basically a way to signal to * any waiting threads that some event has completed. There must be a single publisher thread * and zero or more waiters on arbitrary threads. reset may be called to restart a * sequence (e.g. after notify occurs and all waiters have successfully exited * wait*). **/ class CompletionEvent { public: /** * Notify any waiting threads that the event has completed. It is safe for this to be called * before threads call wait. 
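 *
 * A minimal publisher/waiter sketch (the worker thread and the produce()/consumeResult()
 * helpers are illustrative):
 *
 *   dispenso::CompletionEvent done;
 *   std::thread waiter([&done]() {
 *     done.wait();      // blocks until notify() is called below
 *     consumeResult();
 *   });
 *   produce();          // work on the publishing thread
 *   done.notify();
 *   waiter.join();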
**/ void notify() { impl_.notify(1); } /** * Wait for another thread to notify **/ void wait() const { impl_.wait(1); } /** * Peek to see if the event has been notified in any thread **/ bool completed() const { return impl_.intrusiveStatus().load(std::memory_order_acquire); } /** * Wait for another thread to notify or for the relative timeout to expire, whichever * is first. * * @return true if status is "completed", false if timed out. **/ template bool waitFor(const std::chrono::duration& relTime) const { return impl_.waitFor(1, relTime); } /** * Wait for another thread to notify or for the absolute timeout to expire, whichever * is first. * * @return true if status is "completed", false if timed out. **/ template bool waitUntil(const std::chrono::time_point& absTime) const { return impl_.waitUntil(1, absTime); } /** * Resets the event to "not-completed". This should not be called while an active * wait*\/notify sequence is still currently in play. **/ void reset() { impl_.intrusiveStatus().store(0, std::memory_order_seq_cst); } private: detail::CompletionEventImpl impl_{0}; }; } // namespace dispenso ================================================ FILE: dispenso/concurrent_object_arena.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file concurrent_object_arena.h * @ingroup group_containers * A file providing concurrent object arena container. **/ #pragma once #include #include #include #include #include #include namespace detail { template constexpr T log2i(const T v) { T log2 = 0, val = v; while (val >>= 1) ++log2; return log2; } } // namespace detail namespace dispenso { /** * ConcurrentObjectArena is an indexed sequence container that * allows concurrent insertion to its end. Insertion never invalidates pointers * or references to the rest of the elements. As opposet to std::vector, the * elements of a ConcurrentObjectArena are not stored * contiguously. In memory it is sequence of individually allocated fixed-size * arrays, with additional bookkeeping. The size of arrays is always power of * two (to optimize indexed access) *
 *  buffers  |<────     bufferSize      ────>|
 *    ┌─┐    ┌──────────────────────────────┐
 *    │*├───>│                              │
 *    ├─┤    ├──────────────────────────────┤
 *    │*├───>│                              │
 *    ├─┤    ├──────────────────────────────┤
 *    │*├───>│                              │
 *    └─┘    └──────────────────────────────┘
 *
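 * A minimal usage sketch (assumes default template parameters beyond the element type; the
 * doWork() helper is illustrative):
 *
 *   dispenso::ConcurrentObjectArena<int> arena(512);  // buffer size rounds up to a power of two
 *
 *   // Safe to call concurrently from multiple threads:
 *   auto first = arena.grow_by(16);                   // index of the first of 16 new elements
 *   for (auto i = first; i < first + 16; ++i) {
 *     arena[i] = doWork(i);                           // existing references stay valid across growth
 *   }
 *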
**/ template struct ConcurrentObjectArena { ConcurrentObjectArena() = delete; /** * Construct a ConcurrentObjectArena with given or bigger contiguous array size * * @param minBuffSize The minimum size of the internal buffer. If given * size is not power of 2 the closest bigger power of two would be chosen. **/ explicit ConcurrentObjectArena(const Index minBuffSize) : kLog2BuffSize( ::detail::log2i(minBuffSize) + ((Index{1} << ::detail::log2i(minBuffSize)) == minBuffSize ? 0 : 1)), kBufferSize(Index{1} << kLog2BuffSize), kMask((Index{1} << kLog2BuffSize) - 1), pos_(0), allocatedSize_(0), buffers_(nullptr), buffersSize_(0), buffersPos_(0) { allocateBuffer(); allocatedSize_.store(kBufferSize, std::memory_order_relaxed); } /** * Copy constructor **/ ConcurrentObjectArena(const ConcurrentObjectArena& other) : kLog2BuffSize(other.kLog2BuffSize), kBufferSize(other.kBufferSize), kMask(other.kMask), pos_(other.pos_.load(std::memory_order_relaxed)), allocatedSize_(other.allocatedSize_.load(std::memory_order_relaxed)), buffersSize_(other.buffersSize_), buffersPos_(other.buffersPos_) { T** otherBuffers = other.buffers_.load(std::memory_order_acquire); T** newBuffers = new T*[buffersSize_]; for (Index i = 0; i < buffersSize_; ++i) { void* ptr = detail::alignedMalloc(kBufferSize * sizeof(T), alignment); #if defined(__cpp_exceptions) if (ptr == nullptr) throw std::bad_alloc(); #endif // __cpp_exceptions std::memcpy(ptr, otherBuffers[i], kBufferSize * sizeof(T)); newBuffers[i] = static_cast(ptr); } buffers_.store(newBuffers, std::memory_order_release); } /** * Move constructor **/ ConcurrentObjectArena(ConcurrentObjectArena&& other) noexcept : kLog2BuffSize(0), kBufferSize(0), kMask(0), pos_(0), allocatedSize_(0), buffers_(nullptr), buffersSize_(0), buffersPos_(0) { swap(*this, other); } ~ConcurrentObjectArena() { T** buffers = buffers_.load(std::memory_order_acquire); for (Index i = 0; i < buffersPos_; i++) detail::alignedFree(buffers[i]); delete[] buffers; for (T** p : deleteLater_) delete[] p; } /** * Copy assignment operator. This is not concurrency safe. **/ ConcurrentObjectArena& operator=( ConcurrentObjectArena const& other) { ConcurrentObjectArena copy(other); swap(*this, copy); return *this; } /** * Move assignment operator. This is not concurrency safe. **/ ConcurrentObjectArena& operator=( ConcurrentObjectArena&& other) noexcept { swap(*this, other); return *this; } /** * Grow a container * * This function is thread safe and never invalidates pointers or * references to the rest of the elements. It is lock-free if new elements * can be placed in current buffer. It locks if it allocates a new buffer. * @param delta New size of the container will be delta elements bigger. * @return index of the first element of the allocated group. **/ Index grow_by(const Index delta) { Index newPos; Index oldPos = pos_.load(std::memory_order_relaxed); do { Index curSize = allocatedSize_.load(std::memory_order_acquire); if (oldPos + delta >= curSize) { const std::lock_guard guard(resizeMutex_); curSize = allocatedSize_.load(std::memory_order_relaxed); while (oldPos + delta >= curSize) { allocateBuffer(); allocatedSize_.store(curSize + kBufferSize, std::memory_order_release); curSize = curSize + kBufferSize; } } newPos = oldPos + delta; } while (!std::atomic_compare_exchange_weak_explicit( &pos_, &oldPos, newPos, std::memory_order_release, std::memory_order_relaxed)); constructObjects(oldPos, oldPos + delta); return oldPos; } /** * Access an element of the object arena. Concurrency safe. 
* @param index The index of the element to access. * @return A reference to the element at index. * * @note references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ inline const T& operator[](const Index index) const { const Index bufIndex = index >> kLog2BuffSize; const Index i = index & kMask; return buffers_.load(std::memory_order_acquire)[bufIndex][i]; } /** * Access an element of the object arena. Concurrency safe. * @param index The index of the element to access. * @return A reference to the element at index. * * @note references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ inline T& operator[](const Index index) { return const_cast( const_cast&>(*this)[index]); } /** * Get the size of the object arena. Concurrency safe. * @return The number of elements in the object arena. Note that elements can be appended *concurrently with this call. **/ Index size() const { return pos_.load(std::memory_order_relaxed); } /** * The current capacity of the object arena. Concurrency safe. * @return The current capacity. Note that elements can be appended concurrently **/ Index capacity() const { return allocatedSize_.load(std::memory_order_relaxed); } /** * Number of the internal buffers. Concurrency safe. * @return The current number of buffers. Note that buffers can be appended concurrently **/ Index numBuffers() const { return buffersPos_; } /** * Get the pointer to the buffer. Concurrency safe. * @param index index of the buffer * @return The pointer to the buffer. **/ const T* getBuffer(const Index index) const { return buffers_.load(std::memory_order_acquire)[index]; } /** * Get the pointer to the buffer. Concurrency safe. * @param index index of the buffer * @return The pointer to the buffer. **/ T* getBuffer(const Index index) { return buffers_.load(std::memory_order_acquire)[index]; } /** * Get the used buffer size. not concurrency safe. * @param index index of the buffer. * @return The used buffer size. **/ Index getBufferSize(const Index index) const { const Index numBuffs = numBuffers(); assert(index < numBuffs); if (index < numBuffs - 1) return kBufferSize; else return pos_.load(std::memory_order_relaxed) - (kBufferSize * (numBuffs - 1)); } /** * Swap the contents of containers lhs, and rhs. This is not concurrency safe. 
* @param lhs object arena to swap * @param rhs object arena to swap **/ friend void swap( ConcurrentObjectArena& lhs, ConcurrentObjectArena& rhs) noexcept { using std::swap; swap(lhs.kLog2BuffSize, rhs.kLog2BuffSize); swap(lhs.kBufferSize, rhs.kBufferSize); swap(lhs.kMask, rhs.kMask); const Index rhs_pos = rhs.pos_.load(std::memory_order_relaxed); rhs.pos_.store(lhs.pos_.load(std::memory_order_relaxed), std::memory_order_relaxed); lhs.pos_.store(rhs_pos, std::memory_order_relaxed); const Index rhs_allocatedSize = rhs.allocatedSize_.load(std::memory_order_relaxed); rhs.allocatedSize_.store( lhs.allocatedSize_.load(std::memory_order_relaxed), std::memory_order_relaxed); lhs.allocatedSize_.store(rhs_allocatedSize, std::memory_order_relaxed); T** const rhs_buffers = rhs.buffers_.load(std::memory_order_acquire); rhs.buffers_.store(lhs.buffers_.load(std::memory_order_acquire), std::memory_order_release); lhs.buffers_.store(rhs_buffers, std::memory_order_release); swap(lhs.buffersSize_, rhs.buffersSize_); swap(lhs.buffersPos_, rhs.buffersPos_); swap(lhs.deleteLater_, rhs.deleteLater_); } private: void allocateBuffer() { void* ptr = detail::alignedMalloc(kBufferSize * sizeof(T), alignment); #if defined(__cpp_exceptions) if (ptr == nullptr) throw std::bad_alloc(); #endif // __cpp_exceptions if (buffersPos_ < buffersSize_) { buffers_.load(std::memory_order_acquire)[buffersPos_++] = static_cast(ptr); } else { const Index oldBuffersSize = buffersSize_; T** oldBuffers = buffers_.load(std::memory_order_acquire); buffersSize_ = oldBuffersSize == 0 ? 2 : oldBuffersSize * 2; T** newBuffers = new T*[buffersSize_]; if (oldBuffers != nullptr) { std::memcpy(newBuffers, oldBuffers, sizeof(T*) * oldBuffersSize); deleteLater_.push_back(oldBuffers); } newBuffers[buffersPos_++] = static_cast(ptr); buffers_.store(newBuffers, std::memory_order_release); } } void constructObjects(const Index beginIndex, const Index endIndex) { const Index startBuffer = beginIndex >> kLog2BuffSize; const Index endBuffer = endIndex >> kLog2BuffSize; Index bufStart = beginIndex & kMask; for (Index b = startBuffer; b <= endBuffer; ++b) { T* buf = buffers_.load(std::memory_order_acquire)[b]; const Index bufEnd = b == endBuffer ? (endIndex & kMask) : kBufferSize; for (Index i = bufStart; i < bufEnd; ++i) new (buf + i) T(); bufStart = 0; } } // // kBufferSize = 2^kLog2BuffSize // mask = 0b00011111 // ──┬── // └─number of 1s is log2BuffSize // std::mutex resizeMutex_; Index kLog2BuffSize; Index kBufferSize; Index kMask; std::atomic pos_; std::atomic allocatedSize_; std::atomic buffers_; Index buffersSize_; Index buffersPos_; std::vector deleteLater_; }; } // namespace dispenso ================================================ FILE: dispenso/concurrent_vector.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file concurrent_vector.h * @ingroup group_containers * A file providing a concurrent vector implementation. The basic implementation is similar to * common implementation of deques, with a two-level array structure. For ConcurrentVector, this is * an array of buffers that grow by powers of two. The interface is intended to be a reasonably * complete standin for both std::vector and tbb::concurrent_vector. When compared to std::vector, * it is missing only the .data() accessor, because it is not possible to access the contents as a * contiguous buffer. 
The interface is also compatible with TBB's concurrent_vector, but provides * slightly more functionality to be compatible with std::vector (e.g. .insert() and .erase()), and * also to enable higher performance without requiring double-initialization in some grow_by cases * (via .grow_by_generator()). ConcurrentVector also has a reserving Constructor that can allow for * better performance when the size (or maximum size, or even a guess at the size) is known ahead of * time. * * Like std::deque, and unlike std::vector, it is possible to use non-movable objects in * ConcurrentVector, and references are not invalidated when growing the ConcurrentVector. Iterators * are also stable under these conditions. One other important difference to the std containers, * and also to tbb::concurrentvector is that ConcurrentVector does not take an Allocator, but just * uses appropriately aligned malloc/free for allocation. * * Basically speaking, it is possible to grow the ConcurrentVector concurrently (e.g. via * .grow_by(), .emplace_back(), etc...) very quickly, and it is safe to iterate ranges of the vector * that have already been inserted (and e.g. .begin(), and .end() are thread-safe). As with TBB's * implementation, it is not safe to concurrently call .pop_back(), .reserve(), .resize(), etc... * The operation of these functions is lock-free if memory allocation is also lock-free. * * @note Synchronization model: concurrent growth operations (push_back, emplace_back, grow_by) * provide thread-safe index allocation, but do NOT guarantee that elements are visible to other * threads immediately. Reading an element that is being concurrently constructed by another thread * is a data race. Callers must establish their own happens-before relationship (e.g. via a mutex, * atomic flag, or task completion) between the writer finishing construction and the reader * accessing the element. This is consistent with tbb::concurrent_vector's model. * * As of this writing, with the benchmarks developed for testing ConcurrentVector, ConcurrentVector * appears to be faster than tbb::concurrent_vector by a factor of between about 15% and 3x * (depending on the operation). ConcurrentVector's iteration and random access is on-par with * std::deque (libstdc++), sometimes a bit faster, sometimes a bit slower, but .push_back() is about * an order of magnitude slower for serial use (note that by using one of the .grow_by() variants * could make this on-par for serial code, if applicable, see the "alternative" benchmarks). * * Most notably, in the parallel growth benchmarks, (on Clang 8, Linux, 32-core Threadripper * 2990WX), ConcurrentVector is between about 5x and 20x faster than std::vector + std::mutex, and * is about 1.6x faster than tbb::concurrent_vector. **/ #pragma once #include #include #include #include #include #include #include #include #include #include // Whether to use a non-atomic buffer pointer cache for read-hot paths. // Disabled on ARM where cache-line writes on every growth operation cause store buffer // pressure that exceeds the read-path benefit (acquire loads are effectively free for // sequential access patterns on ARM). #if !defined(DISPENSO_HAS_CACHED_PTRS) #if !defined(__aarch64__) && !defined(_M_ARM64) #define DISPENSO_HAS_CACHED_PTRS 1 #else #define DISPENSO_HAS_CACHED_PTRS 0 #endif #endif namespace dispenso { /** * Available strategies to use for ConcurrentVector reallocation. kFullBufferAhead means that once * we begin to use a buffer, we will also ensure that the next buffer is allocated. 
kHalfBufferAhead * means that once we begin to use an address halfway through the current buffer, we will allocate * the next buffer. kAsNeeded means that once we are ready to use an address in the next buffer (we * are using the last valid address in the current buffer), we allocate the next buffer. kAsNeeded * should be the default to avoid using too much memory, but a small speedup is possible due to less * thread waiting when the other options are used. kFullBufferAhead is fastest, then * kHalfBufferAhead, then kAsNeeded, and the correspondingly kFullBufferAhead uses the most memory, * then kHalfBufferAhead, and kAsNeeded uses the least. **/ enum class ConcurrentVectorReallocStrategy { kFullBufferAhead, kHalfBufferAhead, kAsNeeded }; struct ReserveTagS {}; /** * This ReserveTag can be passed into the reserving constructor for better performance, if even a * rough guess can be made at the number of elements that will be required. **/ constexpr ReserveTagS ReserveTag; // Textual inclusion. Includes undocumented implementation details, e.g. iterators. #include /** * The default ConcurrentVector traits type for defining capacities and max capacities. Both * members are required should one wish to supply a custom set of traits. **/ template struct DefaultConcurrentVectorSizeTraits { /** * @brief This is the starting user-expected capacity in number of elements (algorithm may provide * more based on kReallocStrategy) **/ static constexpr size_t kDefaultCapacity = (sizeof(T) >= 256) ? 2 : 512 / sizeof(T); /** * @brief The maximum possible size for the vector. * * The reason this exists is because if someone doesn't require vectors of length in e.g. * petabytes, the class can use less space. Additionally, some minor optimizations may be * possible if the max size is less than 32-bits. **/ static constexpr size_t kMaxVectorSize = (size_t{1} << (sizeof(size_t) * CHAR_BIT > 47 ? 47 : sizeof(size_t) * CHAR_BIT - 1)) / sizeof(T); }; /** * The default ConcurrentVector traits type. All members are required should one wish to supply a * custom set of traits. **/ struct DefaultConcurrentVectorTraits { /** * @brief Prefer to place the pointers to the buffers inline in the class. * * This can consume a lot of space, e.g. on the stack. This can be a couple of kilobytes. But * performance is improved by 5% to 15%. When set to false, this results in those pointers * residing in a separate heap allocation. **/ static constexpr bool kPreferBuffersInline = true; /** * @brief How far/whether to allocate before memory is required. * * We can allocate ahead, which may reduce the chances of another thread blocking while waiting * for memory to be allocated. This implies a pretty stiff memory overhead, which people may not * want to pay (up to 3x overhead after the first bucket for kFullBufferAhead). This can be set * to kAsNeeded instead of kFullBufferAhead, which makes overheads similar to typical std::vector * implementation (up to 2x overhead). Performance differences are modest, so unless you need to * squeeze the most out of your use, kAsNeeded is okay. **/ static constexpr ConcurrentVectorReallocStrategy kReallocStrategy = ConcurrentVectorReallocStrategy::kAsNeeded; /** * @brief Should we prefer faster, but larger iterators, or slower, but smaller iterators. * * If an algorithm needs to store iterators for later use, size overhead could become a * concern. By using kIteratorPreferSpeed == true, the size of an iterator will be roughly double * the iterator when kIteratorPreferSpeed == false. 
Conversely, iterate + dereference is also * about twice as fast. * **/ static constexpr bool kIteratorPreferSpeed = true; }; /** * A concurrent vector type. It is safe to call .push_back(), .emplace_back(), the various .grow_() * functions, .begin(), .end(), .size(), .empty() concurrently, and existing iterators and *references remain valid with these functions' use. **/ template < typename T, typename Traits = DefaultConcurrentVectorTraits, typename SizeTraits = DefaultConcurrentVectorSizeTraits> class ConcurrentVector { public: using value_type = T; using reference = T&; using const_reference = const T&; using size_type = size_t; using difference_type = ssize_t; using reference_type = T&; using const_reference_type = const T&; using pointer = T*; using const_pointer = const T*; using iterator = std::conditional_t< Traits::kIteratorPreferSpeed, cv::ConcurrentVectorIterator, T, false>, cv::CompactCVecIterator, T, false>>; using const_iterator = std::conditional_t< Traits::kIteratorPreferSpeed, cv::ConcurrentVectorIterator, T, true>, cv::CompactCVecIterator, T, true>>; using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; /** * Default construct the ConcurrentVector. **/ ConcurrentVector() : ConcurrentVector(SizeTraits::kDefaultCapacity / 2, ReserveTag) {} /** * The reserving constructor. By supplying a reasonable starting capacity, e.g. close to max * expected vector size, this can often improve performance substantially by reducing allocations * and increasing data coherency. **/ ConcurrentVector(size_t startCapacity, ReserveTagS) : firstBucketShift_( detail::log2( detail::nextPow2(std::max(startCapacity, SizeTraits::kDefaultCapacity / 2)))), firstBucketLen_(size_type{1} << firstBucketShift_) { T* firstTwo = cv::alloc(2 * firstBucketLen_); initCachedPtrs(); setCachedPtr(0, firstTwo); setCachedPtr(1, firstTwo + firstBucketLen_); buffers_[0].store(firstTwo, std::memory_order_release); buffers_[1].store(firstTwo + firstBucketLen_, std::memory_order_release); } /** * Sizing constructor with default initialization. **/ explicit ConcurrentVector(size_t startSize) : ConcurrentVector(startSize, ReserveTag) { size_.store(startSize, std::memory_order_relaxed); T* buf = buffers_[0].load(std::memory_order_relaxed); for (size_t i = 0; i < startSize; ++i) { new (buf + i) T(); } } /** * Sizing constructor with specified default value. **/ ConcurrentVector(size_t startSize, const T& defaultValue) : ConcurrentVector(startSize, ReserveTag) { size_.store(startSize, std::memory_order_relaxed); T* buf = buffers_[0].load(std::memory_order_relaxed); for (size_t i = 0; i < startSize; ++i) { new (buf + i) T(defaultValue); } } /** * Constructor taking an iterator range. **/ template ConcurrentVector(InIterator start, InIterator end) : ConcurrentVector(std::distance(start, end), start, end) {} /** * Sizing constructor taking an iterator range. If size is known in advance, this may be faster * than just providing the iterator range, especially for input iterators that are not random * access. 
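 *
 * For example (a minimal sketch; the source container is illustrative):
 *
 *   std::vector<int> src = loadValues();  // hypothetical input
 *   dispenso::ConcurrentVector<int> vec(src.size(), src.begin(), src.end());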
**/ template ConcurrentVector(size_type startSize, InIterator start, InIterator end) : ConcurrentVector(startSize, ReserveTag) { size_.store(startSize, std::memory_order_relaxed); assert(std::distance(start, end) == static_cast(startSize)); internalInit(start, end, begin()); } /** * Construct via initializer list **/ ConcurrentVector(std::initializer_list l) : ConcurrentVector(l.size(), std::begin(l), std::end(l)) {} /** * Copy constructor **/ ConcurrentVector(const ConcurrentVector& other) : ConcurrentVector(other.size(), other.cbegin(), other.cend()) {} /** * Move constructor **/ ConcurrentVector(ConcurrentVector&& other) : buffers_(std::move(other.buffers_)), firstBucketShift_(other.firstBucketShift_), firstBucketLen_(other.firstBucketLen_), size_(other.size_.load(std::memory_order_relaxed)) { moveCachedPtrsFrom(other); other.size_.store(0, std::memory_order_relaxed); // This is possibly unnecessary overhead, but enables the "other" vector to be in a valid, // usable state right away, no empty check or clear required, as it is for std::vector. T* firstTwo = cv::alloc(2 * firstBucketLen_); other.setCachedPtr(0, firstTwo); other.setCachedPtr(1, firstTwo + firstBucketLen_); other.buffers_[0].store(firstTwo, std::memory_order_relaxed); other.buffers_[1].store(firstTwo + firstBucketLen_, std::memory_order_relaxed); } /** * Copy assignment operator. This is not concurrency safe. **/ ConcurrentVector& operator=(const ConcurrentVector& other) { if (&other == this) { return *this; } clear(); reserve(other.size()); size_.store(other.size(), std::memory_order_relaxed); internalInit(other.cbegin(), other.cend(), begin()); return *this; } /** * Move assignment operator. This is not concurrency safe. **/ ConcurrentVector& operator=(ConcurrentVector&& other) { using std::swap; if (&other == this) { return *this; } clear(); swap(firstBucketShift_, other.firstBucketShift_); swap(firstBucketLen_, other.firstBucketLen_); buffers_ = std::move(other.buffers_); swapCachedPtrs(other); size_t curLen = size_.load(std::memory_order_relaxed); size_.store(other.size_.load(std::memory_order_relaxed), std::memory_order_relaxed); other.size_.store(curLen, std::memory_order_relaxed); return *this; } /** * Assign the vector. Not concurrency safe. * * @param count The number of elements to have in the vector * @param value The value to copy into each element **/ void assign(size_type count, const T& value) { clear(); reserve(count); size_.store(count, std::memory_order_relaxed); internalFillN(begin(), count, value); } /** * Assign the vector. Not concurrency safe. * * @param start The beginning of the iterator range to assign into the vector * @param end The end of the iterator range to assign into the vector **/ template < typename It, typename = typename std::iterator_traits::difference_type, typename = typename std::iterator_traits::pointer, typename = typename std::iterator_traits::reference, typename = typename std::iterator_traits::value_type, typename = typename std::iterator_traits::iterator_category> void assign(It start, It end) { clear(); auto count = std::distance(start, end); reserve(count); size_.store(count, std::memory_order_relaxed); internalInit(start, end, begin()); } /** * Reserve some capacity for the vector. Not concurrency safe. * * @param capacity The amount of space to ensure is available to avoid further allocations. 
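 *
 * For example (a minimal sketch with an illustrative element count):
 *
 *   dispenso::ConcurrentVector<float> vec;
 *   vec.reserve(1 << 20);  // done single-threaded, before any concurrent growth begins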
**/ void reserve(difference_type capacity) { auto bend = bucketAndSubIndex(capacity); allocateBufferRange({0, 0, firstBucketLen_}, capacity, bend); } /** * Resize the vector, using default initialization for any new values. Not concurrency safe. * * @param len The length of the vector after the resize. **/ void resize(difference_type len) { difference_type curLen = static_cast(size_.load(std::memory_order_relaxed)); if (curLen < len) { grow_to_at_least(len); } else if (curLen > len) { auto it = end(); auto newEnd = begin() + len; do { --it; it->~T(); } while (it != newEnd); size_.store(len, std::memory_order_relaxed); } } /** * Resize the vector, copying the provided value into any new elements. Not concurrency safe. * * @param len The length of the vector after the resize. * @param value The value to copy into any new elements. **/ void resize(difference_type len, const T& value) { difference_type curLen = static_cast(size_.load(std::memory_order_relaxed)); if (curLen < len) { grow_to_at_least(len, value); } else if (curLen > len) { auto it = end(); auto newEnd = begin() + len; do { --it; it->~T(); } while (it != newEnd); size_.store(len, std::memory_order_relaxed); } } /** * The default capacity of this vector. * @return The capacity if the vector were cleared and then called shrink_to_fit. **/ size_type default_capacity() const { return 2 * firstBucketLen_; } /** * The current capacity of the vector. Not concurrency safe. * @return The current capacity. **/ size_type capacity() const { size_t cap = 2 * firstBucketLen_; for (size_t b = 2; b < kMaxBuffers; ++b) { if (!buffers_[b].load(std::memory_order_relaxed)) { break; } cap *= 2; } return cap; } /** * Clear the vector. This does not deallocate buffers, but just ensures that all elements are * destructed. Size will be zero after the call. Not concurrency safe. **/ void clear() { auto binfo = bucketAndSubIndex(size_.load(std::memory_order_relaxed)); size_t len = binfo.bucketIndex; size_t cap = binfo.bucketCapacity; size_t b = binfo.bucket; do { T* buf = buffers_[b].load(std::memory_order_relaxed); T* t = buf + len; while (t != buf) { --t; t->~T(); } cap >>= int{b > 1}; len = cap; } while (b--); size_.store(0, std::memory_order_release); } /** * Gets rid of extra capacity that is not needed to maintain preconditions. At least the default * capacity will remain, even if the size of the vector is zero. Not concurrency safe. **/ void shrink_to_fit() { constexpr size_t kMaxExtra = 2; auto binfo = bucketAndSubIndex(size_.load(std::memory_order_relaxed)); // We need to at least skip the first two buckets, since we have those as a single allocation. size_t startBucket = std::max(2, binfo.bucket + kMaxExtra); for (size_t b = startBucket; b < kMaxBuffers; ++b) { T* ptr = buffers_[b].load(std::memory_order_relaxed); if (!ptr) { break; } if (buffers_.shouldDealloc(b)) { cv::dealloc(ptr); } clearCachedPtr(b); buffers_[b].store(nullptr, std::memory_order_release); } } /** * Destruct the vector. **/ ~ConcurrentVector() { clear(); shrink_to_fit(); cv::dealloc(buffers_[0].load(std::memory_order_acquire)); } /** * Insert a value. Not concurrency safe. * @param pos The point of insertion. * @param value The value to copy into the element at pos. * @return The iterator at the inserted position. **/ iterator insert(const_iterator pos, const T& value) { auto it = insertPartial(pos); new (&*it) T(value); return it; } /** * Insert a value. Not concurrency safe. * @param pos The point of insertion. * @param value The value to move into the element at pos. 
* @return The iterator at the inserted position. **/ iterator insert(const_iterator pos, T&& value) { auto it = insertPartial(pos); new (&*it) T(std::move(value)); return it; } /** * Insert a value. Not concurrency safe. * @param pos The point of insertion. * @param count The number of elements to insert. * @param value The value to copy into each inserted element starting at pos. * @return The iterator at the inserted position. **/ iterator insert(const_iterator pos, size_type count, const T& value) { auto it = insertPartial(pos, count); std::fill_n(it, count, value); return it; } /** * Insert a range of values. Not concurrency safe. * @param pos The point of insertion. * @param first The start of the input iterator range. * @param last The end of the input iterator range. * @return The iterator at the inserted position. **/ template < typename InputIt, typename = typename std::iterator_traits::difference_type, typename = typename std::iterator_traits::pointer, typename = typename std::iterator_traits::reference, typename = typename std::iterator_traits::value_type, typename = typename std::iterator_traits::iterator_category> iterator insert(const_iterator pos, InputIt first, InputIt last) { size_t len = std::distance(first, last); auto it = insertPartial(pos, len); std::copy_n(first, len, it); return it; } /** * Insert an initializer_list. Not concurrency safe. * @param pos The point of insertion. * @param ilist The initializer_list. * @return The iterator at the inserted position. **/ iterator insert(const_iterator pos, std::initializer_list ilist) { return insert(pos, ilist.begin(), ilist.end()); } /** * Erase one element. Not concurrency safe. * @param pos The point of erasure. * @return Iterator following the removed element. If pos refers to the last element, then * the end() iterator is returned. **/ iterator erase(const_iterator pos) { auto e = end(); if (e == pos) { return e; } size_.fetch_sub(1, std::memory_order_relaxed); --e; if (e == pos) { e->~T(); return e; } ++e; auto it = begin(); it += (pos - it); return std::move(pos + 1, const_iterator(e), it); } /** * Erase a range of elements. Not concurrency safe. * @param first The starting point of erasure. * @param last The ending point of erasure. * @return Iterator following the last removed element. If pos refers to the last element, then * the end() iterator is returned. If last==end() prior to removal, then the updated end() * iterator is returned. If [first, last) is an empty range, then last is returned. **/ iterator erase(const_iterator first, const_iterator last) { size_t len = std::distance(first, last); auto it = begin(); size_t startIdx = first - it; if (len == 0) { return it + (startIdx + len); } it += startIdx; auto e_it = std::move(last, cend(), it); if (e_it < last) { // remove any values that were not already moved into do { --last; last->~T(); } while (e_it != last); } size_.fetch_sub(len, std::memory_order_relaxed); return e_it; } /** * Push a value onto the end of the vector. Concurrency safe. * @param val The value to copy into the new element. * @return The iterator at the point of insertion. **/ iterator push_back(const T& val) { return emplace_back(val); } /** * Push a value onto the end of the vector. Concurrency safe. * @param val The value to move into the new element. * @return The iterator at the point of insertion. **/ iterator push_back(T&& val) { return emplace_back(std::move(val)); } /** * Push a value onto the end of the vector. Concurrency safe. 
* @param args The arg pack used to construct the new element. * @return The iterator at the point of insertion. **/ template iterator emplace_back(Args&&... args) { auto index = size_.fetch_add(1, std::memory_order_relaxed); auto binfo = bucketAndSubIndex(index); allocateBuffer(binfo); iterator ret{this, index, binfo}; if (Traits::kIteratorPreferSpeed) { new (&*ret) T(std::forward(args)...); } else { new (buffers_[binfo.bucket] + binfo.bucketIndex) T(std::forward(args)...); } return ret; } /** * Grow the vector, constructing new elements via a generator. Concurrency safe. * @param delta The number of elements to grow by. * @param gen The generator to use to construct new elements. Must have operator()() and return a * valid T. * @return The iterator to the start of the grown range. **/ template iterator grow_by_generator(size_type delta, Gen gen) { iterator ret = growByUninitialized(delta); for (auto it = ret; delta--; ++it) { new (&*it) T(gen()); } return ret; } /** * Grow the vector, copying the provided value into new elements. Concurrency safe. * @param delta The number of elements to grow by. * @param t The value to copy into all new elements. * @return The iterator to the start of the grown range. **/ iterator grow_by(size_type delta, const T& t) { iterator ret = growByUninitialized(delta); internalFillN(ret, delta, t); return ret; } /** * Grow the vector, default initializing new elements. Concurrency safe. * @param delta The number of elements to grow by. * @return The iterator to the start of the grown range. **/ iterator grow_by(size_type delta) { iterator ret = growByUninitialized(delta); internalFillDefaultN(ret, delta); return ret; } /** * Grow the vector with an input iterator range. Concurrency safe. * @param start The start of the input iterator range. * @param end The end of the input iterator range. * @return The iterator to the start of the grown range. **/ template < typename It, typename = typename std::iterator_traits::difference_type, typename = typename std::iterator_traits::pointer, typename = typename std::iterator_traits::reference, typename = typename std::iterator_traits::value_type, typename = typename std::iterator_traits::iterator_category> iterator grow_by(It start, It end) { iterator ret = growByUninitialized(std::distance(start, end)); internalInit(start, end, ret); return ret; } /** * Grow the vector with an initializer_list. Concurrency safe. * @param initList The initializer_list to use to initialize the newly added range. * @return The iterator to the start of the grown range. **/ iterator grow_by(std::initializer_list initList) { return grow_by(std::begin(initList), std::end(initList)); } /** * Grow the vector to at least the desired size, default initializing new elements. Concurrency * safe. * @param n The required length * @return The iterator to the nth element. **/ iterator grow_to_at_least(size_type n) { size_t curSize = size_.load(std::memory_order_relaxed); if (curSize < n) { return grow_by(n - curSize); } return {this, n - 1, bucketAndSubIndex(n - 1)}; } /** * Grow the vector to at least the desired size, copying the provided value into new elements. * Concurrency safe. * @param n The required length * @param t The value to copy into each new element. * @return The iterator to the nth element. 
**/ iterator grow_to_at_least(size_type n, const T& t) { size_t curSize = size_.load(std::memory_order_relaxed); if (curSize < n) { return grow_by(n - curSize, t); } return {this, n - 1, bucketAndSubIndex(n - 1)}; } /** * Pop the last element off the vector. Not concurrency safe. **/ void pop_back() { auto binfo = bucketAndSubIndex(size_.fetch_sub(1, std::memory_order_relaxed) - 1); T* elt = buffers_[binfo.bucket].load(std::memory_order_relaxed) + binfo.bucketIndex; elt->~T(); } /** * Access an element of the vector. Concurrency safe. * @param index The index of the element to access. * @return A reference to the element at index. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const T& operator[](size_type index) const { auto binfo = bucketAndSubIndexForIndex(index); T* buf = cachedBuffer(binfo.bucket); return buf[binfo.bucketIndex]; } /** * Access an element of the vector. Concurrency safe. * @param index The index of the element to access. * @return A reference to the element at index. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ T& operator[](size_type index) { auto binfo = bucketAndSubIndexForIndex(index); T* buf = cachedBuffer(binfo.bucket); return buf[binfo.bucketIndex]; } /** * Access an element of the vector. Out-of-boundes accesses generate an exception. Concurrency * safe. * @param index The index of the element to access. * @return A reference to the element at index. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const T& at(size_type index) const { #if defined(__cpp_exceptions) if (index >= size_.load(std::memory_order_relaxed)) { throw std::out_of_range("Index too large"); } #endif return operator[](index); } /** * Access an element of the vector. Out-of-boundes accesses generate an exception. Concurrency * safe. * @param index The index of the element to access. * @return A reference to the element at index. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ T& at(size_type index) { #if defined(__cpp_exceptions) if (index >= size_.load(std::memory_order_relaxed)) { throw std::out_of_range("Index too large"); } #endif return operator[](index); } /** * Get an iterator to the start of the vector. Concurrency safe. * @return An iterator to the start of the vector. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ iterator begin() { return {this, 0, {0, 0, firstBucketLen_}}; } /** * Get an iterator to the end of the vector. Concurrency safe. * @return An iterator to the end of the vector. Note that it is possible for the end to change * concurrently with this call. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ iterator end() { size_t curSize = size_.load(std::memory_order_relaxed); auto binfo = bucketAndSubIndex(curSize); return {this, curSize, binfo}; } /** * Get an iterator to the start of the vector. Concurrency safe. 
* @return An iterator to the start of the vector. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const_iterator begin() const { return {this, 0, {0, 0, firstBucketLen_}}; } /** * Get an iterator to the end of the vector. Concurrency safe. * @return An iterator to the end of the vector. Note that it is possible for the end to change * concurrently with this call. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const_iterator end() const { size_t curSize = size_.load(std::memory_order_relaxed); auto binfo = bucketAndSubIndex(curSize); return {this, curSize, binfo}; } /** * Get an iterator to the start of the vector. Concurrency safe. * @return An iterator to the start of the vector. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const_iterator cbegin() const { return {this, 0, {0, 0, firstBucketLen_}}; } /** * Get an iterator to the end of the vector. Concurrency safe. * @return An iterator to the end of the vector. Note that it is possible for the end to change * concurrently with this call. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const_iterator cend() const { size_t curSize = size_.load(std::memory_order_relaxed); auto binfo = bucketAndSubIndex(curSize); return {this, curSize, binfo}; } /** * Get a starting reverse iterator to the vector. Concurrency safe. * @return A starting reverse iterator to the vector. Note that it is possible for the the rbegin * to change concurrently with this call. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ reverse_iterator rbegin() { return reverse_iterator(end()); } /** * Get an ending reverse iterator to the vector. Concurrency safe. * @return An ending reverse iterator to the vector. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ reverse_iterator rend() { return reverse_iterator(begin()); } /** * Get a starting reverse iterator to the vector. Concurrency safe. * @return A starting reverse iterator to the vector. Note that it is possible for the the rbegin * to change concurrently with this call. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const_reverse_iterator rbegin() const { return const_reverse_iterator(cend()); } /** * Get an ending reverse iterator to the vector. Concurrency safe. * @return An ending reverse iterator to the vector. * * @note Iterators and references are stable even if other threads are inserting, but it is * still the users responsibility to avoid racing on the element itself. **/ const_reverse_iterator rend() const { return const_reverse_iterator(cbegin()); } /** * Checks if the vector contains any elements. Concurrency safe. * @return true if the vector contains no elements, false otherwise. Note that an element could * be appended concurrently with this call. 
**/ bool empty() const { return size_.load(std::memory_order_relaxed) == 0; } /** * Get the maximum size a vector of this type can theoretically have. Concurrency safe * (constexpr). * @return The max size a vector of this type could theoretically have. **/ constexpr size_type max_size() const noexcept { return Traits::kMaxVectorSize; } /** * Get the size of the vector. Concurrency safe. * @return The number of elements in the vector. Note that elements can be appended concurrently * with this call. **/ size_type size() const { return size_.load(std::memory_order_relaxed); } /** * Get the front element of the vector. Concurrency safe. * @return A reference to the first element in the vector. **/ T& front() { return *buffers_[0].load(std::memory_order_relaxed); } /** * Get the front element of the vector. Concurrency safe. * @return A reference to the first element in the vector. **/ const T& front() const { return *buffers_[0].load(std::memory_order_relaxed); } /** * Get the last element of the vector. Concurrency safe. * @return A reference to the last element in the vector. Note that elements could be appended * concurrently with this call (in which case it won't be the back anymore). **/ T& back() { return *(end() - 1); } /** * Get the last element of the vector. Concurrency safe. * @return A reference to the last element in the vector. Note that elements could be appended * concurrently with this call (in which case it won't be the back anymore). **/ const T& back() const { return *(end() - 1); } /** * Swap the contents (and iterators, and element references) of the current vector with oth. * @param oth The vector to swap with. **/ void swap(ConcurrentVector& oth) { using std::swap; swap(firstBucketShift_, oth.firstBucketShift_); swap(firstBucketLen_, oth.firstBucketLen_); size_t othlen = oth.size_.load(std::memory_order_relaxed); oth.size_.store(size_.load(std::memory_order_relaxed), std::memory_order_relaxed); size_.store(othlen, std::memory_order_relaxed); // okay, this relies on the fact that we're essentially swapping in the move operator. buffers_ = std::move(oth.buffers_); swapCachedPtrs(oth); } private: DISPENSO_INLINE cv::BucketInfo bucketAndSubIndexForIndex(size_t index) const { #if defined(_MSC_VER) || defined(__aarch64__) || defined(_M_ARM64) // Branching fast path — benefits MSVC and ARM where branch prediction handles // sequential access patterns well, avoiding log2/division for the common case. if (index < firstBucketLen_) { return {0, index, firstBucketLen_}; } size_t l2idx = detail::log2(index); size_t bucket = (l2idx + 1) - firstBucketShift_; size_t bucketCapacity = size_t{1} << l2idx; size_t bucketIndex = index - bucketCapacity; return {bucket, bucketIndex, bucketCapacity}; #else return bucketAndSubIndex(index); #endif // _MSC_VER || __aarch64__ || _M_ARM64 } DISPENSO_INLINE cv::BucketInfo bucketAndSubIndex(size_t index) const { size_t l2idx = detail::log2(index | 1); size_t bucket = (l2idx + 1) - firstBucketShift_; size_t bucketCapacity = size_t{1} << l2idx; size_t bucketIndex = index - bucketCapacity; bucket = index < firstBucketLen_ ? 0 : bucket; bucketIndex = index < firstBucketLen_ ? index : bucketIndex; bucketCapacity = index < firstBucketLen_ ? 
firstBucketLen_ : bucketCapacity; return {bucket, bucketIndex, bucketCapacity}; } template void internalInit(InIterator start, InIterator end, iterator it) { while (start != end) { new (&*it) T(*start); ++it; ++start; } } void internalFillN(iterator it, size_t len, const T& value) { for (; len--; ++it) { new (&*it) T(value); } } void internalFillDefaultN(iterator it, size_t len) { for (; len--; ++it) { new (&*it) T(); } } iterator growByUninitialized(size_type delta) { auto index = size_.fetch_add(delta, std::memory_order_relaxed); auto binfo = bucketAndSubIndex(index); auto bend = bucketAndSubIndex(index + delta); allocateBufferRange(binfo, delta, bend); return {this, index, binfo}; } iterator insertPartial(const_iterator pos) { auto e = end(); auto index = size_.fetch_add(1, std::memory_order_relaxed); auto binfo = bucketAndSubIndex(index); allocateBuffer(binfo); new (&*e) T(); return std::move_backward(pos, const_iterator(e), e + 1) - 1; } iterator insertPartial(const_iterator pos, size_t len) { auto e = end(); auto index = size_.fetch_add(len, std::memory_order_relaxed); auto binfo = bucketAndSubIndex(index); auto bend = bucketAndSubIndex(index + len); allocateBufferRange(binfo, len, bend); for (auto it = e + len; it != e;) { --it; new (&*it) T(); } return std::move_backward(pos, const_iterator(e), e + len) - len; } alignas(kCacheLineSize) cv::ConVecBuffer< T, SizeTraits::kDefaultCapacity / 2, SizeTraits::kMaxVectorSize, Traits::kPreferBuffersInline, Traits::kReallocStrategy> buffers_; static constexpr size_t kMaxBuffers = detail::log2const(SizeTraits::kMaxVectorSize / (SizeTraits::kDefaultCapacity / 2)) + 1; #if DISPENSO_HAS_CACHED_PTRS // Non-atomic cache of buffer pointers for read-hot paths. Buffer pointers are write-once // (never change after allocation) and are stored here BEFORE the release store to buffers_[], // so any thread that acquires from buffers_[] is guaranteed to see the cached value. // All concurrent writes to a given slot store the same value. // Disabled on ARM where cache-line invalidation pressure exceeds the read-path benefit. alignas(kCacheLineSize) mutable T* cachedPtrs_[kMaxBuffers]; #endif size_t firstBucketShift_; size_t firstBucketLen_; alignas(kCacheLineSize) std::atomic size_{0}; DISPENSO_INLINE T* cachedBuffer(size_t bucket) const { #if DISPENSO_HAS_CACHED_PTRS return cachedPtrs_[bucket]; #else // Relaxed is sufficient: callers only access indices that have been fully constructed, which // requires an external happens-before with the writing thread. That synchronization // transitively carries the buffer pointer visibility (which was release-stored during // allocation). This matches the original pre-cache memory ordering and avoids the cost of // ldar (load-acquire) on AArch64. 
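    // A concrete illustration (an illustrative pattern, not a specific dispenso call site):
    // thread A appends with emplace_back(), which acquire-loads (and, if it allocates,
    // release-stores) buffers_ internally, and then hands the resulting index to thread B
    // through some release/acquire channel of its own. B's acquire also makes the earlier
    // buffer-pointer store visible, so the relaxed load below is safe for that index.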
return buffers_[bucket].load(std::memory_order_relaxed); #endif } DISPENSO_INLINE void allocateBuffer(const cv::BucketInfo& binfo) { #if DISPENSO_HAS_CACHED_PTRS buffers_.allocAsNecessary(binfo, cachedPtrs_); #else buffers_.allocAsNecessary(binfo); #endif } DISPENSO_INLINE void allocateBufferRange(const cv::BucketInfo& binfo, ssize_t rangeLen, const cv::BucketInfo& bend) { #if DISPENSO_HAS_CACHED_PTRS buffers_.allocAsNecessary(binfo, rangeLen, bend, cachedPtrs_); #else buffers_.allocAsNecessary(binfo, rangeLen, bend); #endif } DISPENSO_INLINE void initCachedPtrs() { #if DISPENSO_HAS_CACHED_PTRS for (size_t i = 0; i < kMaxBuffers; ++i) { cachedPtrs_[i] = nullptr; } #endif } DISPENSO_INLINE void setCachedPtr(size_t bucket, T* ptr) { #if DISPENSO_HAS_CACHED_PTRS cachedPtrs_[bucket] = ptr; #else (void)bucket; (void)ptr; #endif } DISPENSO_INLINE void clearCachedPtr(size_t bucket) { #if DISPENSO_HAS_CACHED_PTRS cachedPtrs_[bucket] = nullptr; #else (void)bucket; #endif } DISPENSO_INLINE void moveCachedPtrsFrom(ConcurrentVector& other) { #if DISPENSO_HAS_CACHED_PTRS for (size_t i = 0; i < kMaxBuffers; ++i) { cachedPtrs_[i] = other.cachedPtrs_[i]; other.cachedPtrs_[i] = nullptr; } #else (void)other; #endif } DISPENSO_INLINE void swapCachedPtrs(ConcurrentVector& other) { #if DISPENSO_HAS_CACHED_PTRS for (size_t i = 0; i < kMaxBuffers; ++i) { T* cur = cachedPtrs_[i]; cachedPtrs_[i] = other.cachedPtrs_[i]; other.cachedPtrs_[i] = cur; } #else (void)other; #endif } friend class cv::ConVecIterBase, T>; friend class cv::ConcurrentVectorIterator, T, false>; friend class cv::ConcurrentVectorIterator, T, true>; }; template inline bool operator==( const ConcurrentVector& a, const ConcurrentVector& b) { return a.size() == b.size() && std::equal(a.begin(), a.end(), b.begin()); } template inline bool operator!=( const ConcurrentVector& a, const ConcurrentVector& b) { return !(a == b); } template inline bool operator<( const ConcurrentVector& a, const ConcurrentVector& b) { return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end()); } template inline bool operator>( const ConcurrentVector& a, const ConcurrentVector& b) { return b < a; } template inline bool operator<=( const ConcurrentVector& a, const ConcurrentVector& b) { return !(b < a); } template inline bool operator>=( const ConcurrentVector& a, const ConcurrentVector& b) { return !(a < b); } template inline void swap(ConcurrentVector& a, ConcurrentVector& b) { a.swap(b); } // Textual inclusion. Includes undocumented implementation details, e.g. iterators. #include } // namespace dispenso ================================================ FILE: dispenso/detail/can_invoke.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace dispenso { namespace detail { template using void_t = void; template struct CanInvoke : std::false_type {}; template struct CanInvoke()(std::declval()...))>> : std::true_type {}; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/completion_event_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include // Implementation notes: // 1. You should know what you're doing if you are directly using CompletionEventImpl. Most people // should be using CompletionEvent, which is designed for safe use by folks without strong // knowledge of atomics, etc... // 2. intrusiveStatus may be used to gain access to the internal atomic status // 3. intrusiveStatus may set the status to anything... *but* once wait functions have been called, // if intrusiveStatus sets the status to completedStatus, it should not change the status again. // After notify, the status should not be modified. // 4. It is not an error to set the status to completedStatus, and then call notify with // completedStatus, but it is very likely to be unnecessary. Usually, while there may be threads // racing to get some statuses, and they should use compare_exchange functions to resolve those, // setting completed status should not typically be racy. #include "notifier_common.h" namespace dispenso { namespace detail { #define DISPENSO_COMPLETION_SUPPORTS_TRY_NOTIFY #if defined(__linux__) class CompletionEventImpl { public: CompletionEventImpl(int initStatus) : ftx_(initStatus) {} void notify(int completedStatus) { status_.store(completedStatus, std::memory_order_release); futex(&ftx_, FUTEX_WAKE_PRIVATE, std::numeric_limits::max(), nullptr, nullptr, 0); } void tryNotify() { futex(&ftx_, FUTEX_WAKE_PRIVATE, std::numeric_limits::max(), nullptr, nullptr, 0); } void wait(int completedStatus) const { int current; while ((current = status_.load(std::memory_order_acquire)) != completedStatus) { futex(&ftx_, FUTEX_WAIT_PRIVATE, current, nullptr, nullptr, 0); } } bool waitFor(int completedStatus, const std::chrono::duration& relTime) const { if (status_.load(std::memory_order_acquire) == completedStatus) { return true; } double relSeconds = relTime.count(); if (relSeconds <= 0.0) { return false; } struct timespec ts; ts.tv_sec = static_cast(relSeconds); relSeconds -= static_cast(ts.tv_sec); ts.tv_nsec = static_cast(1e9 * relSeconds); // TODO: determine if we should worry about reducing timeout time subsequent times through the // loop in the case of spurious wake. int current; while ((current = status_.load(std::memory_order_acquire)) != completedStatus) { if (futex(&ftx_, FUTEX_WAIT_PRIVATE, current, &ts, nullptr, 0) && errno == ETIMEDOUT) { // Intentionally not re-checking status: returning false on timeout is consistent with // std::condition_variable::wait_for semantics. The benign race where notify arrives // concurrently with timeout is handled by the caller retrying if needed. return false; } } return true; } template bool waitUntil(int completedStatus, const std::chrono::time_point& absTime) const { if (status_.load(std::memory_order_acquire) == completedStatus) { return true; } return waitFor(completedStatus, absTime - Clock::now()); } std::atomic& intrusiveStatus() { return status_; } const std::atomic& intrusiveStatus() const { return status_; } private: union { mutable int ftx_; std::atomic status_; }; }; #elif defined(__MACH__) && defined(DISPENSO_HAS_OS_SYNC) // Uses a 64-bit combined {status, waiterCount} to eliminate the Dekker race between notify() and // wait(). notify() atomically sets status while observing waiterCount via CAS; wait() atomically // increments waiterCount while observing status via CAS. The 8-byte futex compare detects changes // to either field. 
intrusiveStatus() returns a reference to the status portion (lower 32 bits on // little-endian) for direct CAS/fetch_sub by FutureImplBase and Latch. class CompletionEventImpl { public: CompletionEventImpl(int initStatus) : combined_(static_cast(static_cast(initStatus))) {} void notify(int completedStatus) { uint64_t old = combined_.load(std::memory_order_relaxed); uint64_t newVal; do { newVal = (old & kWaiterMask) | static_cast(completedStatus); } while (!combined_.compare_exchange_weak( old, newVal, std::memory_order_release, std::memory_order_relaxed)); if (old & kWaiterMask) { mac_futex_wake_all(&combined_, sizeof(uint64_t)); } } void tryNotify() { if (combined_.load(std::memory_order_acquire) & kWaiterMask) { mac_futex_wake_all(&combined_, sizeof(uint64_t)); } } void wait(int completedStatus) const { uint64_t val = combined_.load(std::memory_order_acquire); while (static_cast(val) != completedStatus) { uint64_t newVal = val + kOneWaiter; if (!combined_.compare_exchange_weak( val, newVal, std::memory_order_acq_rel, std::memory_order_acquire)) { continue; } mac_futex_wait(&combined_, newVal, sizeof(uint64_t)); combined_.fetch_sub(kOneWaiter, std::memory_order_acq_rel); val = combined_.load(std::memory_order_acquire); } } bool waitFor(int completedStatus, const std::chrono::duration& relTime) const { if (static_cast(combined_.load(std::memory_order_acquire)) == completedStatus) { return true; } double relSeconds = relTime.count(); if (relSeconds <= 0.0) { return false; } uint64_t relTimeUs = static_cast(relSeconds * 1e6); // TODO: determine if we should worry about reducing timeout time subsequent times through the // loop in the case of spurious wake. uint64_t val = combined_.load(std::memory_order_acquire); while (static_cast(val) != completedStatus) { uint64_t newVal = val + kOneWaiter; if (!combined_.compare_exchange_weak( val, newVal, std::memory_order_acq_rel, std::memory_order_acquire)) { continue; } int ret = mac_futex_wait_for(&combined_, newVal, sizeof(uint64_t), relTimeUs); combined_.fetch_sub(kOneWaiter, std::memory_order_acq_rel); if (ret && errno == ETIMEDOUT) { // Intentionally not re-checking status: returning false on timeout is consistent with // std::condition_variable::wait_for semantics. The benign race where notify arrives // concurrently with timeout is handled by the caller retrying if needed. 
return false; } val = combined_.load(std::memory_order_acquire); } return true; } template bool waitUntil(int completedStatus, const std::chrono::time_point& absTime) const { if (static_cast(combined_.load(std::memory_order_acquire)) == completedStatus) { return true; } return waitFor(completedStatus, absTime - Clock::now()); } std::atomic& intrusiveStatus() { return parts_[0]; } const std::atomic& intrusiveStatus() const { return parts_[0]; } private: static constexpr uint64_t kOneWaiter = 1ULL << 32; static constexpr uint64_t kWaiterMask = ~0xFFFFFFFFULL; union { mutable std::atomic combined_; mutable std::atomic parts_[2]; // [0] = status (little-endian) }; }; #elif defined(_WIN32) class CompletionEventImpl { public: CompletionEventImpl(int initStatus) : status_(initStatus) {} void notify(int completedStatus) { status_.store(completedStatus, std::memory_order_release); WakeByAddressAll(&status_); } void tryNotify() { WakeByAddressAll(&status_); } void wait(int completedStatus) const { int current; while ((current = status_.load(std::memory_order_acquire)) != completedStatus) { WaitOnAddress(&status_, ¤t, sizeof(int), kInfiniteWin); } } bool waitFor(int completedStatus, const std::chrono::duration& relTime) const { if (status_.load(std::memory_order_acquire) == completedStatus) { return true; } double relSeconds = relTime.count(); if (relSeconds <= 0.0) { return false; } int msWait = std::max(1, static_cast(relSeconds * 1000.0)); // TODO: determine if we should worry about reducing timeout time subsequent times through the // loop in the case of spurious wake. int current; while ((current = status_.load(std::memory_order_acquire)) != completedStatus) { if (!WaitOnAddress(&status_, ¤t, sizeof(int), msWait) && GetLastError() == kErrorTimeoutWin) { // Intentionally not re-checking status: returning false on timeout is consistent with // std::condition_variable::wait_for semantics. The benign race where notify arrives // concurrently with timeout is handled by the caller retrying if needed. return false; } } return true; } template bool waitUntil(int completedStatus, const std::chrono::time_point& absTime) const { if (status_.load(std::memory_order_acquire) == completedStatus) { return true; } return waitFor(completedStatus, absTime - Clock::now()); } std::atomic& intrusiveStatus() { return status_; } const std::atomic& intrusiveStatus() const { return status_; } private: mutable std::atomic status_; }; #else #undef DISPENSO_COMPLETION_SUPPORTS_TRY_NOTIFY // Fallback C++11 implementation. class CompletionEventImpl { public: CompletionEventImpl(int initStatus) : status_(initStatus) {} void notify(int completedStatus) { // See https://en.cppreference.com/w/cpp/thread/condition_variable // "Even if the shared variable is atomic, it must be modified under the mutex in order to // correctly publish the modification to the waiting thread." 
{ std::lock_guard lk(mtx_); status_.store(completedStatus, std::memory_order_release); } cv_.notify_all(); } void wait(int completedStatus) const { if (status_.load(std::memory_order_acquire) != completedStatus) { std::unique_lock lk(mtx_); cv_.wait(lk, [this, completedStatus]() { return status_.load(std::memory_order_relaxed) == completedStatus; }); } } template bool waitFor(int completedStatus, const std::chrono::duration& relTime) const { if (status_.load(std::memory_order_acquire) == completedStatus) { return true; } std::unique_lock lk(mtx_); return cv_.wait_for(lk, relTime, [this, completedStatus] { return status_.load(std::memory_order_relaxed) == completedStatus; }); } template bool waitUntil(int completedStatus, const std::chrono::time_point& absTime) const { if (status_.load(std::memory_order_acquire) == completedStatus) { return true; } std::unique_lock lk(mtx_); return cv_.wait_until(lk, absTime, [this, completedStatus] { return status_.load(std::memory_order_relaxed) == completedStatus; }); } std::atomic& intrusiveStatus() { return status_; } const std::atomic& intrusiveStatus() const { return status_; } private: mutable std::mutex mtx_; mutable std::condition_variable cv_; std::atomic status_; }; #endif // platform } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/concurrent_vector_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ // This file intended for textual inclusion into concurrent_vector.h only namespace cv { struct BucketInfo { size_t bucket; size_t bucketIndex; size_t bucketCapacity; }; template class ConVecIterBase { public: using iterator_category = std::random_access_iterator_tag; using difference_type = ssize_t; using value_type = T; ConVecIterBase() = default; ConVecIterBase(const VecT* vec, BucketInfo info); protected: ConVecIterBase(uintptr_t vb, T* bucketStart, T* bucketPtr, T* bucketEnd) : vb_(vb), bucketStart_(bucketStart), bucketPtr_(bucketPtr), bucketEnd_(bucketEnd) {} struct VecAndBucket { const VecT* vec; const size_t bucket; }; VecAndBucket getVecAndBucket() const { static_assert( alignof(VecT) >= 64, "This code relies on the ConcurrentVector pointer to be 64-byte aligned"); if (sizeof(uintptr_t) == 8) { return {reinterpret_cast(vb_ & 0xffffffffffffffc0UL), vb_ & 0x3f}; } else { return {reinterpret_cast(vb_ & 0xffffffc0), vb_ & 0x3f}; } } // Note: Could have a size-optimized iterator via traits. It is easy to reduce this to 24 bytes // on 64-bit platform, but this will result in a large enough performance hit that storing an // index is preferable (see CompactCVecIterator). 16 bytes on 32-bit platform. 
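  // vb_ packs the owning ConcurrentVector pointer and the bucket index into one word: the
  // vector is at least 64-byte aligned (see the static_assert in getVecAndBucket()), so the
  // low 6 bits are free to hold the bucket, and getVecAndBucket() unpacks both with masks.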
alignas(4 * sizeof(uintptr_t)) uintptr_t vb_; T* bucketStart_; T* bucketPtr_; T* bucketEnd_; private: friend bool operator!=(const ConVecIterBase& a, const ConVecIterBase& b) { return a.bucketPtr_ != b.bucketPtr_; } friend bool operator==(const ConVecIterBase& a, const ConVecIterBase& b) { return a.bucketPtr_ == b.bucketPtr_; } friend bool operator<(const ConVecIterBase& a, const ConVecIterBase& b) { return a.vb_ < b.vb_ || (a.vb_ == b.vb_ && a.bucketPtr_ < b.bucketPtr_); } friend bool operator<=(const ConVecIterBase& a, const ConVecIterBase& b) { return a.vb_ < b.vb_ || (a.vb_ == b.vb_ && a.bucketPtr_ <= b.bucketPtr_); } friend bool operator>(const ConVecIterBase& a, const ConVecIterBase& b) { return a.vb_ > b.vb_ || (a.vb_ == b.vb_ && a.bucketPtr_ > b.bucketPtr_); } friend bool operator>=(const ConVecIterBase& a, const ConVecIterBase& b) { return a.vb_ > b.vb_ || (a.vb_ == b.vb_ && a.bucketPtr_ >= b.bucketPtr_); } friend difference_type operator-(const ConVecIterBase& a, const ConVecIterBase& b) { size_t abucket = a.vb_ & 0x3f; size_t bbucket = b.vb_ & 0x3f; if (abucket == bbucket) { return a.bucketPtr_ - b.bucketPtr_; } ssize_t aLeft = a.bucketPtr_ - a.bucketStart_; ssize_t bLeft = b.bucketPtr_ - b.bucketStart_; aLeft += (bool)abucket * (a.bucketEnd_ - a.bucketStart_); bLeft += (bool)bbucket * (b.bucketEnd_ - b.bucketStart_); return aLeft - bLeft; } }; template class ConcurrentVectorIterator : public ConVecIterBase { public: using iterator_category = typename ConVecIterBase::iterator_category; using difference_type = typename ConVecIterBase::difference_type; using value_type = typename ConVecIterBase::value_type; using pointer = std::conditional_t, T*>; using reference = std::conditional_t, T&>; ConcurrentVectorIterator() = default; ConcurrentVectorIterator(const VecT* vec, BucketInfo info) : ConVecIterBase(vec, info) {} ConcurrentVectorIterator(const VecT* vec, size_t /*index*/, BucketInfo info) : ConcurrentVectorIterator(vec, info) {} template > ConcurrentVectorIterator(const ConcurrentVectorIterator& other) : ConVecIterBase(other) {} reference operator*() const; pointer operator->() const; reference operator[](difference_type n) const; ConcurrentVectorIterator& operator++(); ConcurrentVectorIterator operator++(int) { auto result = *this; operator++(); return result; } ConcurrentVectorIterator& operator--(); ConcurrentVectorIterator operator--(int) { auto result = *this; operator--(); return result; } ConcurrentVectorIterator& operator+=(difference_type n); ConcurrentVectorIterator& operator-=(difference_type n) { return operator+=(-n); } ConcurrentVectorIterator operator+(difference_type n) const; ConcurrentVectorIterator operator-(difference_type n) const { return operator+(-n); } private: ConcurrentVectorIterator(uintptr_t vb, T* bucketStart, T* bucketPtr, T* bucketEnd) : ConVecIterBase(vb, bucketStart, bucketPtr, bucketEnd) {} using ConVecIterBase::getVecAndBucket; using ConVecIterBase::vb_; using ConVecIterBase::bucketStart_; using ConVecIterBase::bucketPtr_; using ConVecIterBase::bucketEnd_; }; template class CompactCVecIterBase { public: using iterator_category = std::random_access_iterator_tag; using difference_type = ssize_t; using value_type = T; CompactCVecIterBase() = default; CompactCVecIterBase(const VecT* vec, size_t index) : vec_(vec), index_(index) {} protected: // 16 bytes on 64-bit platform, 8 bytes on 32-bit platform. 
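  // CompactCVecIterator keeps only the vector pointer and a flat element index; each
  // dereference re-derives the bucket via ConcurrentVector::operator[], trading a little
  // per-access work for the smaller footprint noted above.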
const VecT* vec_; size_t index_; private: friend bool operator!=(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ != b.index_; } friend bool operator==(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ == b.index_; } friend bool operator<(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ < b.index_; } friend bool operator<=(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ <= b.index_; } friend bool operator>(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ > b.index_; } friend bool operator>=(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ >= b.index_; } friend difference_type operator-(const CompactCVecIterBase& a, const CompactCVecIterBase& b) { return a.index_ - b.index_; } }; template class CompactCVecIterator : public CompactCVecIterBase { public: using difference_type = ssize_t; using value_type = T; using pointer = std::conditional_t, T*>; using reference = std::conditional_t, T&>; using iterator_category = std::random_access_iterator_tag; CompactCVecIterator() = default; CompactCVecIterator(const VecT* vec, size_t index, BucketInfo) : CompactCVecIterator(vec, index) {} CompactCVecIterator(const VecT* vec, size_t index) : CompactCVecIterBase(vec, index) {} template > CompactCVecIterator(const CompactCVecIterator& other) : CompactCVecIterBase(other) {} reference operator*() const; pointer operator->() const; reference operator[](difference_type n) const; CompactCVecIterator& operator++() { ++index_; return *this; } CompactCVecIterator operator++(int) { auto result = *this; operator++(); return result; } CompactCVecIterator& operator--() { --index_; return *this; } CompactCVecIterator operator--(int) { auto result = *this; operator--(); return result; } CompactCVecIterator& operator+=(difference_type n) { index_ += n; return *this; } CompactCVecIterator& operator-=(difference_type n) { return operator+=(-n); } CompactCVecIterator operator+(difference_type n) const { return {vec_, index_ + n}; } CompactCVecIterator operator-(difference_type n) const { return operator+(-n); } private: using CompactCVecIterBase::vec_; using CompactCVecIterBase::index_; }; template inline T* alloc(size_t elts) { return reinterpret_cast(detail::alignedMalloc(elts * sizeof(T), alignof(T))); } template inline void dealloc(T* p) { detail::alignedFree(p); } template class ConVecBufferBase {}; template class ConVecBufferBase { public: static constexpr size_t kMaxBuffers = detail::log2const(kMaxVectorSize / kMinBufferSize) + 1; ConVecBufferBase() : buffers_(alloc>(kMaxBuffers)) { for (size_t i = 0; i < kMaxBuffers; ++i) { buffers_[i].store(nullptr, std::memory_order_relaxed); } } ~ConVecBufferBase() { dealloc>(buffers_); } ConVecBufferBase(ConVecBufferBase&& other) : buffers_(std::exchange(other.buffers_, alloc>(kMaxBuffers))) { for (size_t i = 0; i < kMaxBuffers; ++i) { other.buffers_[i].store(nullptr, std::memory_order_relaxed); } } ConVecBufferBase& operator=(ConVecBufferBase&& other) { using std::swap; swap(other.buffers_, buffers_); return *this; } protected: detail::AlignedAtomic* buffers_; }; template class ConVecBufferBase { public: static constexpr size_t kMaxBuffers = detail::log2const(kMaxVectorSize / kMinBufferSize) + 1; ConVecBufferBase() { for (size_t i = 0; i < kMaxBuffers; ++i) { buffers_[i].store(nullptr, std::memory_order_relaxed); } } ConVecBufferBase(ConVecBufferBase&& other) { for (size_t i = 0; i < kMaxBuffers; ++i) { 
buffers_[i].store( other.buffers_[i].load(std::memory_order_relaxed), std::memory_order_relaxed); other.buffers_[i].store(nullptr, std::memory_order_relaxed); } } ConVecBufferBase& operator=(ConVecBufferBase&& other) { for (size_t i = 0; i < kMaxBuffers; ++i) { T* cur = buffers_[i].load(std::memory_order_relaxed); T* oth = other.buffers_[i].load(std::memory_order_relaxed); buffers_[i].store(oth, std::memory_order_relaxed); other.buffers_[i].store(cur, std::memory_order_relaxed); } return *this; } protected: detail::AlignedAtomic buffers_[kMaxBuffers]; }; template < typename T, size_t kMinBufferSize, size_t kMaxVectorSize, bool kMakeInline, ConcurrentVectorReallocStrategy kStrategy> class ConVecBuffer : public ConVecBufferBase { public: ConVecBuffer() { std::memset(shouldDealloc_, 0, BaseType::kMaxBuffers * sizeof(bool)); } ConVecBuffer(ConVecBuffer&& other) : BaseType(std::move(other)) { std::memcpy(shouldDealloc_, other.shouldDealloc_, BaseType::kMaxBuffers * sizeof(bool)); std::memset(other.shouldDealloc_, 0, BaseType::kMaxBuffers * sizeof(bool)); } ConVecBuffer& operator=(ConVecBuffer&& other) { if (&other != this) { BaseType::operator=(std::move(other)); std::swap_ranges( other.shouldDealloc_, other.shouldDealloc_ + BaseType::kMaxBuffers, shouldDealloc_); } return *this; } detail::AlignedAtomic& operator[](size_t bucket) { return this->buffers_[bucket]; } const detail::AlignedAtomic& operator[](size_t bucket) const { return this->buffers_[bucket]; } void allocAsNecessary(const BucketInfo& binfo) { allocAsNecessaryImpl(binfo, [](size_t, T*) {}); } void allocAsNecessary(const BucketInfo& binfo, T** cachedPtrs) { allocAsNecessaryImpl(binfo, [cachedPtrs](size_t b, T* p) { cachedPtrs[b] = p; }); } void allocAsNecessary(const BucketInfo& binfo, ssize_t rangeLen, const BucketInfo& bend) { allocAsNecessaryImpl(binfo, rangeLen, bend, [](size_t, T*) {}); } void allocAsNecessary( const BucketInfo& binfo, ssize_t rangeLen, const BucketInfo& bend, T** cachedPtrs) { allocAsNecessaryImpl( binfo, rangeLen, bend, [cachedPtrs](size_t b, T* p) { cachedPtrs[b] = p; }); } bool shouldDealloc(size_t bucket) const { return shouldDealloc_[bucket]; } private: using BaseType = ConVecBufferBase; DISPENSO_INLINE static size_t allocCheckIndex(size_t bucketCapacity) { return (kStrategy == ConcurrentVectorReallocStrategy::kFullBufferAhead) ? 0 : (kStrategy == ConcurrentVectorReallocStrategy::kHalfBufferAhead ? 
bucketCapacity / 2 : bucketCapacity - 1); } template void allocAsNecessaryImpl(const BucketInfo& binfo, CacheUpdate&& cacheUpdate) { if (DISPENSO_EXPECT(binfo.bucketIndex == allocCheckIndex(binfo.bucketCapacity), 0)) { if (!this->buffers_[binfo.bucket + 1].load(std::memory_order_acquire)) { T* newBuf = cv::alloc(binfo.bucketCapacity << 1); cacheUpdate(binfo.bucket + 1, newBuf); this->buffers_[binfo.bucket + 1].store(newBuf, std::memory_order_release); shouldDealloc_[binfo.bucket + 1] = true; } } while (DISPENSO_EXPECT(!this->buffers_[binfo.bucket].load(std::memory_order_acquire), 0)) { } } template DISPENSO_INLINE bool tryAssignBuffer( size_t bucket, T*& allocBufs, size_t cap, bool firstAccounted, CacheUpdate&& cacheUpdate) { if (!this->buffers_[bucket].load(std::memory_order_acquire)) { cacheUpdate(bucket, allocBufs); this->buffers_[bucket].store(allocBufs, std::memory_order_release); allocBufs += cap; shouldDealloc_[bucket] = !firstAccounted; return true; } return false; } template void allocAsNecessaryImpl( const BucketInfo& binfo, ssize_t rangeLen, const BucketInfo& bend, CacheUpdate&& cacheUpdate) { const size_t indexToCheck = allocCheckIndex(binfo.bucketCapacity); bool allocCurrentBucket = binfo.bucketIndex <= indexToCheck && binfo.bucketIndex + rangeLen > indexToCheck; if (DISPENSO_EXPECT(allocCurrentBucket || binfo.bucket < bend.bucket, 0)) { size_t sizeToAlloc = 0; size_t cap = binfo.bucketCapacity << ((bool)binfo.bucket + !allocCurrentBucket); size_t bucket = binfo.bucket + 1 + !allocCurrentBucket; for (; bucket <= bend.bucket; ++bucket, cap <<= 1) { if (!this->buffers_[bucket].load(std::memory_order_acquire)) { sizeToAlloc += cap; } } assert(bucket == bend.bucket + 1); assert((bucket == 1 && cap == bend.bucketCapacity) || cap == bend.bucketCapacity * 2); const size_t endToCheck = allocCheckIndex(bend.bucketCapacity); if (DISPENSO_EXPECT(bend.bucketIndex > endToCheck, 0)) { if (!this->buffers_[bucket].load(std::memory_order_acquire)) { sizeToAlloc += cap; } } T* allocBufs = nullptr; if (sizeToAlloc) { allocBufs = cv::alloc(sizeToAlloc); } bool firstAccounted = false; cap = binfo.bucketCapacity << ((bool)binfo.bucket + !allocCurrentBucket); bucket = binfo.bucket + 1 + !allocCurrentBucket; for (; bucket <= bend.bucket; ++bucket, cap <<= 1) { firstAccounted |= tryAssignBuffer(bucket, allocBufs, cap, firstAccounted, cacheUpdate); } assert(bucket == bend.bucket + 1); assert((bucket == 1 && cap == bend.bucketCapacity) || cap == bend.bucketCapacity * 2); if (DISPENSO_EXPECT(bend.bucketIndex > endToCheck, 0)) { tryAssignBuffer(bucket, allocBufs, cap, firstAccounted, cacheUpdate); } } for (size_t bucket = binfo.bucket; bucket <= bend.bucket; ++bucket) { while (DISPENSO_EXPECT(!this->buffers_[bucket].load(std::memory_order_acquire), 0)) { #if defined(DISPENSO_HAS_TSAN) std::this_thread::sleep_for(std::chrono::microseconds(1)); #endif // DISPENSO_HAS_TSAN } } } bool shouldDealloc_[BaseType::kMaxBuffers]; }; } // namespace cv ================================================ FILE: dispenso/detail/concurrent_vector_impl2.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ // This file intended for textual inclusion into concurrent_vector.h only namespace cv { template DISPENSO_INLINE ConVecIterBase::ConVecIterBase(const VecT* vec, cv::BucketInfo info) : vb_(reinterpret_cast(vec) | info.bucket), bucketStart_(vec->buffers_[info.bucket].load(std::memory_order_relaxed)), bucketPtr_(bucketStart_ + info.bucketIndex), bucketEnd_(bucketStart_ + info.bucketCapacity) {} template DISPENSO_INLINE ConcurrentVectorIterator& ConcurrentVectorIterator::operator++() { ++bucketPtr_; if (bucketPtr_ == bucketEnd_) { auto len = bucketEnd_ - bucketStart_; ++vb_; auto vb = getVecAndBucket(); len <<= int{vb.bucket > 1}; bucketPtr_ = bucketStart_ = vb.vec->cachedBuffer(vb.bucket); bucketEnd_ = bucketPtr_ + len; } return *this; } template DISPENSO_INLINE ConcurrentVectorIterator& ConcurrentVectorIterator::operator--() { --bucketPtr_; if (bucketPtr_ < bucketStart_) { auto vb = getVecAndBucket(); if (vb.bucket) { auto len = bucketEnd_ - bucketStart_; --vb_; len >>= int{vb.bucket > 1}; bucketStart_ = vb.vec->cachedBuffer(vb.bucket - 1); bucketPtr_ = bucketStart_ + len; bucketEnd_ = bucketPtr_; --bucketPtr_; } } return *this; } template DISPENSO_INLINE typename ConcurrentVectorIterator::reference ConcurrentVectorIterator::operator*() const { return *bucketPtr_; } template DISPENSO_INLINE typename ConcurrentVectorIterator::pointer ConcurrentVectorIterator::operator->() const { return &operator*(); } template DISPENSO_INLINE typename ConcurrentVectorIterator::reference ConcurrentVectorIterator::operator[](difference_type n) const { T* nPtr = bucketPtr_ + n; if (nPtr >= bucketStart_ && nPtr < bucketEnd_) { return *nPtr; } auto vb = getVecAndBucket(); // Reconstruct index ssize_t oldIndex = bucketPtr_ - bucketStart_; oldIndex += (bool)vb.bucket * (bucketEnd_ - bucketStart_); auto binfo = vb.vec->bucketAndSubIndexForIndex(oldIndex + n); return *(vb.vec->cachedBuffer(binfo.bucket) + binfo.bucketIndex); } template DISPENSO_INLINE ConcurrentVectorIterator& ConcurrentVectorIterator::operator+=(difference_type n) { T* nPtr = bucketPtr_ + n; if (nPtr >= bucketStart_ && nPtr < bucketEnd_) { bucketPtr_ = nPtr; return *this; } auto vb = getVecAndBucket(); // Reconstruct index ssize_t oldIndex = bucketPtr_ - bucketStart_; oldIndex += (bool)vb.bucket * (bucketEnd_ - bucketStart_); auto binfo = vb.vec->bucketAndSubIndexForIndex(oldIndex + n); bucketStart_ = vb.vec->cachedBuffer(binfo.bucket); bucketEnd_ = bucketStart_ + binfo.bucketCapacity; bucketPtr_ = bucketStart_ + binfo.bucketIndex; vb_ = reinterpret_cast(vb.vec) | binfo.bucket; return *this; } template DISPENSO_INLINE ConcurrentVectorIterator ConcurrentVectorIterator::operator+(difference_type n) const { T* nPtr = bucketPtr_ + n; if (nPtr >= bucketStart_ && nPtr < bucketEnd_) { return {vb_, bucketStart_, nPtr, bucketEnd_}; } auto vb = getVecAndBucket(); // Reconstruct index ssize_t oldIndex = bucketPtr_ - bucketStart_; oldIndex += (bool)vb.bucket * (bucketEnd_ - bucketStart_); auto binfo = vb.vec->bucketAndSubIndexForIndex(oldIndex + n); return {vb.vec, binfo}; } template DISPENSO_INLINE typename CompactCVecIterator::reference CompactCVecIterator::operator*() const { return const_cast(*vec_)[index_]; } template DISPENSO_INLINE typename CompactCVecIterator::pointer CompactCVecIterator::operator->() const { return &operator*(); } template DISPENSO_INLINE typename CompactCVecIterator::reference CompactCVecIterator::operator[](ssize_t n) const { return const_cast(*vec_)[index_ + n]; } } // namespace cv 
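// Illustrative usage sketch (not part of the library source; in a real program this would
// live in user code, not in this header). The hypothetical function below exercises only the
// public ConcurrentVector API documented in concurrent_vector.h: concurrency-safe growth from
// several threads, followed by single-threaded use of the non-concurrency-safe operations.

#include <dispenso/concurrent_vector.h>
#include <thread>
#include <vector>

inline void concurrentVectorUsageSketch() {
  dispenso::ConcurrentVector<int> vec;
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; ++t) {
    threads.emplace_back([&vec, t]() {
      for (int i = 0; i < 1000; ++i) {
        // push_back/emplace_back/grow_by are concurrency safe; existing iterators and
        // references remain stable while other threads append.
        vec.push_back(t * 1000 + i);
      }
    });
  }
  for (auto& th : threads) {
    th.join();
  }
  // Once the appending threads have joined, non-concurrency-safe operations (resize, erase,
  // assign, shrink_to_fit, ...) may be used freely from a single thread.
  vec.resize(static_cast<int>(vec.size() / 2));
}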
================================================ FILE: dispenso/detail/epoch_waiter.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include "notifier_common.h" namespace dispenso { namespace detail { #if defined(__linux__) class EpochWaiter { public: EpochWaiter() : ftx_(0) {} void bumpAndWake() { epoch_.fetch_add(1, std::memory_order_acq_rel); futex(&ftx_, FUTEX_WAKE_PRIVATE, 1, nullptr, nullptr, 0); } void bumpAndWakeAll() { epoch_.fetch_add(1, std::memory_order_acq_rel); futex(&ftx_, FUTEX_WAKE_PRIVATE, std::numeric_limits::max(), nullptr, nullptr, 0); } void bumpAndWakeN(int n, int totalWaiters) { epoch_.fetch_add(1, std::memory_order_acq_rel); (void)totalWaiters; futex(&ftx_, FUTEX_WAKE_PRIVATE, n, nullptr, nullptr, 0); } uint32_t wait(uint32_t expectedEpoch) const { uint32_t current; // allow spurious wakeups if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { futex(&ftx_, FUTEX_WAIT_PRIVATE, expectedEpoch, nullptr, nullptr, 0); } else { return current; } return epoch_.load(std::memory_order_acquire); } uint32_t current() const { return epoch_.load(std::memory_order_acquire); } uint32_t waitFor(uint32_t expectedEpoch, uint64_t relTimeUs) const { uint32_t current; if ((current = epoch_.load(std::memory_order_acquire)) != expectedEpoch) { return current; } struct timespec ts; ts.tv_sec = static_cast(relTimeUs / 1000000); ts.tv_nsec = static_cast((relTimeUs - (ts.tv_sec * 1000000)) * 1000); // allow spurious wakeups if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { futex(&ftx_, FUTEX_WAIT_PRIVATE, current, &ts, nullptr, 0); } else { return current; } return epoch_.load(std::memory_order_acquire); } private: union { mutable int ftx_; std::atomic epoch_; }; }; #elif defined(__MACH__) && defined(DISPENSO_HAS_MAC_FUTEX) class EpochWaiter { public: EpochWaiter() : epoch_(0) {} void bumpAndWake() { epoch_.fetch_add(1, std::memory_order_acq_rel); mac_futex_wake_one(&epoch_, sizeof(uint32_t)); } void bumpAndWakeAll() { epoch_.fetch_add(1, std::memory_order_acq_rel); mac_futex_wake_all(&epoch_, sizeof(uint32_t)); } void bumpAndWakeN(int n, int totalWaiters) { epoch_.fetch_add(1, std::memory_order_acq_rel); if (n >= totalWaiters) { mac_futex_wake_all(&epoch_, sizeof(uint32_t)); } else { for (int i = 0; i < n; ++i) { mac_futex_wake_one(&epoch_, sizeof(uint32_t)); } } } uint32_t wait(uint32_t expectedEpoch) const { uint32_t current; // Allow spurious wake if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { mac_futex_wait(&epoch_, expectedEpoch, sizeof(uint32_t)); } else { return current; } return epoch_.load(std::memory_order_acquire); } uint32_t current() const { return epoch_.load(std::memory_order_acquire); } uint32_t waitFor(uint32_t expectedEpoch, uint64_t relTimeUs) const { uint32_t current; if ((current = epoch_.load(std::memory_order_acquire)) != expectedEpoch) { return current; } // Allow spurious wake if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { mac_futex_wait_for(&epoch_, current, sizeof(uint32_t), relTimeUs); } else { return current; } return epoch_.load(std::memory_order_acquire); } private: mutable std::atomic epoch_; }; #elif defined(_WIN32) class EpochWaiter { public: EpochWaiter() : epoch_(0) {} void bumpAndWake() { epoch_.fetch_add(1, std::memory_order_acq_rel); // 
Always WakeAll on Windows. WakeByAddressSingle and WakeByAddressAll // have comparable per-call kernel cost, but WakeAll keeps more threads // in their spin phase (kBackoffYield iterations) where they can pick up // subsequent work without another kernel wake transition. WakeByAddressAll(&epoch_); } void bumpAndWakeAll() { epoch_.fetch_add(1, std::memory_order_acq_rel); WakeByAddressAll(&epoch_); } void bumpAndWakeN(int /*n*/, int totalWaiters) { epoch_.fetch_add(1, std::memory_order_acq_rel); // Only use WakeByAddressSingle when there is exactly one waiter — // it avoids waking threads that aren't waiting. Otherwise, WakeAll // is preferred: same single kernel transition, and keeps threads // spinning where they can absorb follow-up work without re-waking. if (totalWaiters <= 1) { WakeByAddressSingle(&epoch_); } else { WakeByAddressAll(&epoch_); } } uint32_t wait(uint32_t expectedEpoch) const { uint32_t current; // Allow spurious wake if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { WaitOnAddress(&epoch_, ¤t, sizeof(uint32_t), kInfiniteWin); } else { return current; } return epoch_.load(std::memory_order_acquire); } uint32_t current() const { return epoch_.load(std::memory_order_acquire); } uint32_t waitFor(uint32_t expectedEpoch, uint64_t relTimeUs) const { uint32_t current; if ((current = epoch_.load(std::memory_order_acquire)) != expectedEpoch) { return current; } // WaitOnAddress takes DWORD (unsigned long) milliseconds. // Clamp to the largest finite timeout to avoid overflow or // accidentally passing INFINITE (0xFFFFFFFF = wait forever). uint64_t msVal = std::max(uint64_t{1}, relTimeUs / 1000); unsigned long msWait = msVal >= kInfiniteWin ? static_cast(kInfiniteWin - 1) : static_cast(msVal); // Allow spurious wake if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { WaitOnAddress(&epoch_, ¤t, sizeof(uint32_t), msWait); } else { return current; } return epoch_.load(std::memory_order_acquire); } private: mutable std::atomic epoch_; }; #else // Fallback C++11 implementation. 
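// The portable fallback below keeps the same contract as the futex/WaitOnAddress variants
// above: wait(expectedEpoch) may return spuriously and simply reports the epoch it currently
// observes, so callers follow the usual eventcount pattern (illustrative sketch assuming a
// caller-owned predicate; not a specific dispenso call site):
//
//   uint32_t epoch = waiter.current();
//   while (!predicate()) {
//     epoch = waiter.wait(epoch); // sleeps only while the epoch still equals `epoch`
//   }
//
// Producers update the shared state first and then call bumpAndWake()/bumpAndWakeAll(), whose
// fetch_add(acq_rel) provides the release half of the synchronization.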
class EpochWaiter { public: EpochWaiter() : epoch_(0) {} void bumpAndWake() { epoch_.fetch_add(1, std::memory_order_acq_rel); cv_.notify_one(); } void bumpAndWakeAll() { epoch_.fetch_add(1, std::memory_order_acq_rel); cv_.notify_all(); } void bumpAndWakeN(int n, int totalWaiters) { epoch_.fetch_add(1, std::memory_order_acq_rel); if (n >= totalWaiters) { cv_.notify_all(); } else { for (int i = 0; i < n; ++i) { cv_.notify_one(); } } } uint32_t wait(uint32_t expectedEpoch) const { uint32_t current; // Allow spurious wake if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { std::unique_lock lk(mtx_); cv_.wait(lk); } else { return current; } return epoch_.load(std::memory_order_acquire); } uint32_t current() const { return epoch_.load(std::memory_order_acquire); } uint32_t waitFor(uint32_t expectedEpoch, uint64_t relTimeUs) const { uint32_t current; if ((current = epoch_.load(std::memory_order_acquire)) != expectedEpoch) { return current; } // Allow spurious wake if ((current = epoch_.load(std::memory_order_acquire)) == expectedEpoch) { std::unique_lock lk(mtx_); cv_.wait_for(lk, std::chrono::microseconds(static_cast(relTimeUs))); } else { return current; } return epoch_.load(std::memory_order_acquire); } private: mutable std::mutex mtx_; mutable std::condition_variable cv_; std::atomic epoch_; }; #endif // platform } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/future_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace dispenso { namespace detail { template class FutureImplResultMember { public: Result& result() const { #if defined(__cpp_exceptions) if (exception_) { std::rethrow_exception(exception_); } #endif // __cpp_exceptions return *reinterpret_cast(resultBuf_); } ~FutureImplResultMember() { #if defined(__cpp_exceptions) // We don't want to call the destructor if we never set the result. if (exception_) { return; } #endif // __cpp_exceptions reinterpret_cast(resultBuf_)->~Result(); } protected: template void runToResult(Func&& f) { #if defined(__cpp_exceptions) try { new (resultBuf_) Result(f()); } catch (...) { exception_ = std::current_exception(); } #else new (resultBuf_) Result(f()); #endif // __cpp_exceptions } template void setAsResult(T&& t) { new (resultBuf_) Result(std::forward(t)); } alignas(Result) mutable char resultBuf_[sizeof(Result)]; // Always present to ensure stable ABI layout regardless of __cpp_exceptions. std::exception_ptr exception_; }; template class FutureImplResultMember { protected: template void runToResult(Func&& f) { #if defined(__cpp_exceptions) try { result_ = &f(); } catch (...) { exception_ = std::current_exception(); } #else result_ = &f(); #endif // __cpp_exceptions } Result& result() const { #if defined(__cpp_exceptions) if (exception_) { std::rethrow_exception(exception_); } #endif // __cpp_exceptions return *result_; } void setAsResult(Result* res) { result_ = res; } Result* result_; // Always present to ensure stable ABI layout regardless of __cpp_exceptions. std::exception_ptr exception_; }; template <> class FutureImplResultMember { protected: template void runToResult(Func&& f) { #if defined(__cpp_exceptions) try { f(); } catch (...) 
{ exception_ = std::current_exception(); } #else f(); #endif // __cpp_exceptions } void result() const { #if defined(__cpp_exceptions) if (exception_) { std::rethrow_exception(exception_); } #endif // __cpp_exceptions } void setAsResult() const {} // Always present to ensure stable ABI layout regardless of __cpp_exceptions. std::exception_ptr exception_; }; template class FutureBase; template class FutureImplBase : private FutureImplResultMember, public OnceCallable { public: enum Status { kNotStarted, kRunning, kReady }; using FutureImplResultMember::result; using FutureImplResultMember::setAsResult; using FutureImplResultMember::runToResult; bool ready() { return status_.intrusiveStatus().load(std::memory_order_acquire) == kReady; } void run() override { (void)run(kNotStarted); decRefCountMaybeDestroy(); } void wait() { if (waitCommon(true)) { return; } status_.wait(kReady); } template std::future_status waitFor(const std::chrono::duration& timeoutDuration) { if (waitCommon(allowInline_) || status_.waitFor(kReady, timeoutDuration)) { return std::future_status::ready; } return std::future_status::timeout; } template std::future_status waitUntil(const std::chrono::time_point& timeoutTime) { if (waitCommon(allowInline_) || status_.waitUntil(kReady, timeoutTime)) { return std::future_status::ready; } return std::future_status::timeout; } void incRefCount() { refCount_.fetch_add(1, std::memory_order_acquire); } void decRefCountMaybeDestroy() { DISPENSO_TSAN_ANNOTATE_HAPPENS_BEFORE(&refCount_); if (refCount_.fetch_sub(1, std::memory_order_release) == 1) { DISPENSO_TSAN_ANNOTATE_HAPPENS_AFTER(&refCount_); dealloc(); } } void setReady() { status_.intrusiveStatus().store(kReady, std::memory_order_release); refCount_.store(1, std::memory_order_release); } void setAllowInline(bool allow) { allowInline_ = allow; } void setTaskSetCounter(std::atomic* tsc) { taskSetCounter_ = tsc; } protected: bool run(int s) { while (s == kNotStarted) { if (status_.intrusiveStatus().compare_exchange_weak(s, kRunning, std::memory_order_acq_rel)) { runFunc(); status_.notify(kReady); if (taskSetCounter_) { // If we want TaskSet::wait to imply Future::is_ready(), // we need to signal that *after* setting the Future status to ready. taskSetCounter_->fetch_sub(1, std::memory_order_release); } tryExecuteThenChain(); return true; } } return false; } using ThenChainInvoke = void(void*, void*); struct ThenChain { ThenChain* next; void* impl; void* schedulable; ThenChainInvoke* invoke; ThenChain* scheduleDestroyAndGetNext() { invoke(impl, schedulable); constexpr size_t kImplSize = static_cast(nextPow2(sizeof(this))); auto* ret = this->next; deallocSmallBuffer(this); return ret; } }; void tryExecuteThenChain() { ThenChain* head = thenChain_.load(std::memory_order_acquire); // While the chain contains anything, let's try to get it and dispatch the chain. while (head) { if (thenChain_.compare_exchange_weak(head, nullptr, std::memory_order_acq_rel)) { // Managed to exchange with head, value of thenChain_ now points to null chain. // Head points to the implicit list of items to be executed. while (head) { head = head->scheduleDestroyAndGetNext(); } // At this point, the list is exhausted, so we exit the outer loop too. It would be valid // to get head again and try all over again, but it is guaranteed that if another link was // added concurrently to our dispatch, that thread will attempt to dispatch it's own chain, // so we just exit the loop instead. 
} } } template static void thenChainInvokeAsync(void* implv, void* schedulablev) { SomeFutureImpl* impl = reinterpret_cast(implv); Schedulable* schedulable = reinterpret_cast(schedulablev); schedulable->schedule(OnceFunction(impl, true), ForceQueuingTag()); } template static void thenChainInvoke(void* implv, void* schedulablev) { SomeFutureImpl* impl = reinterpret_cast(implv); Schedulable* schedulable = reinterpret_cast(schedulablev); schedulable->schedule(OnceFunction(impl, true)); } template void addToThenChainOrExecute(SomeFutureImpl* impl, Schedulable& sched, std::launch asyncPolicy) { if (status_.intrusiveStatus().load(std::memory_order_acquire) == kReady) { if ((asyncPolicy & std::launch::async) == std::launch::async) { sched.schedule(OnceFunction(impl, true), ForceQueuingTag()); } else { sched.schedule(OnceFunction(impl, true)); } return; } constexpr size_t kImplSize = static_cast(nextPow2(sizeof(ThenChain))); auto* buffer = allocSmallBuffer(); ThenChain* link = reinterpret_cast(buffer); link->impl = impl; using NonConstSchedulable = std::remove_const_t; NonConstSchedulable* nonConstSched = const_cast(&sched); link->schedulable = nonConstSched; if ((asyncPolicy & std::launch::async) == std::launch::async) { link->invoke = thenChainInvokeAsync; } else { link->invoke = thenChainInvoke; } link->next = thenChain_.load(std::memory_order_acquire); while (!thenChain_.compare_exchange_weak(link->next, link, std::memory_order_acq_rel)) { } // Okay, one last thing. It is possible that we added to the thenChain just after // tryExecuteThenChain was called from run(). We still need to ensure that this work is kicked // off, so just double check here, and execute if that may have happened. if (status_.intrusiveStatus().load(std::memory_order_acquire) == kReady) { tryExecuteThenChain(); } } inline bool waitCommon(bool allowInline) { int s = status_.intrusiveStatus().load(std::memory_order_acquire); return s == kReady || (allowInline && run(s)); } virtual void runFunc() = 0; virtual void dealloc() = 0; // Futures always run to completion (no cancellation), so destroyOnly() is unreachable. 
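Stepping back from the class internals: stripped of the future-specific bookkeeping, addToThenChainOrExecute and tryExecuteThenChain reduce to a small lock-free pattern in which producers CAS new links onto an intrusive singly linked list and the completing thread claims the whole list at once and dispatches it. A distilled sketch of that pattern, using simplified stand-in types rather than the real ThenChain/FutureImpl machinery:

#include <atomic>

struct Link {
  Link* next;
  void (*invoke)(Link*);
};

std::atomic<Link*> gHead{nullptr};

// Producer side (cf. addToThenChainOrExecute): publish one continuation.
void push(Link* link) {
  link->next = gHead.load(std::memory_order_acquire);
  while (!gHead.compare_exchange_weak(link->next, link, std::memory_order_acq_rel)) {
  }
}

// Consumer side (cf. tryExecuteThenChain): claim the whole chain, then run it.
// The real code claims via a CAS loop that swaps in nullptr; exchange expresses the same idea.
void drain() {
  Link* claimed = gHead.exchange(nullptr, std::memory_order_acq_rel);
  while (claimed) {
    Link* next = claimed->next;
    claimed->invoke(claimed);
    claimed = next;
  }
}

The race that the comments above call out, a push landing just after a drain has emptied the list, is handled by making the pusher re-check readiness and run the drain itself, so a late link is never stranded.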
void destroyOnly() override { assert(false && "destroyOnly() should never be called on a FutureImpl"); } ~FutureImplBase() override = default; bool allowInline_; CompletionEventImpl status_{kNotStarted}; std::atomic refCount_{2}; std::atomic* taskSetCounter_{nullptr}; std::atomic thenChain_{nullptr}; template friend FutureImplBase* createValueFutureImplReady(std::reference_wrapper t); template friend FutureImplBase* createRefFutureImplReady(T&& t); template friend class FutureBase; }; // namespace detail template class FutureImplSmall : public FutureImplBase { public: FutureImplSmall(F&& f) { new (func_) F(std::move(f)); } protected: void runFunc() override { F* f = reinterpret_cast(func_); this->runToResult(*f); f->~F(); } void dealloc() override { this->~FutureImplSmall(); deallocSmallBuffer(this); } ~FutureImplSmall() override {} private: alignas(F) char func_[sizeof(F)]; }; template class FutureImplSmall : public FutureImplBase { protected: void runFunc() override {} void dealloc() override { this->~FutureImplSmall(); deallocSmallBuffer(this); } ~FutureImplSmall() override {} }; template class FutureImplAlloc : public FutureImplBase { public: FutureImplAlloc(F&& f) { new (func_) F(std::move(f)); } protected: void runFunc() override { F* f = reinterpret_cast(func_); this->runToResult(*f); f->~F(); } void dealloc() override { this->~FutureImplAlloc(); alignedFree(this); } ~FutureImplAlloc() override {} private: alignas(F) char func_[sizeof(F)]; }; template inline FutureImplBase* createFutureImpl(F&& f, bool allowInline, std::atomic* taskSetCounter) { using FNoRef = typename std::remove_reference::type; constexpr size_t kImplSize = static_cast(nextPow2(sizeof(FutureImplSmall<16, FNoRef, Result>))); using SmallT = FutureImplSmall; FutureImplBase* ret = new (allocSmallBuffer()) SmallT(std::forward(f)); ret->setAllowInline(allowInline); ret->setTaskSetCounter(taskSetCounter); return ret; } template inline FutureImplBase* createValueFutureImplReady(T&& t) { constexpr size_t kImplSize = static_cast(nextPow2(sizeof(FutureImplSmall<16, void, Result>))); using SmallT = FutureImplSmall; FutureImplBase* ret = new (allocSmallBuffer()) SmallT(); ret->setAsResult(std::forward(t)); ret->setReady(); return ret; } template inline FutureImplBase* createRefFutureImplReady(std::reference_wrapper x) { constexpr size_t kImplSize = static_cast(nextPow2(sizeof(FutureImplSmall<16, void, X&>))); using SmallT = FutureImplSmall; FutureImplBase* ret = new (allocSmallBuffer()) SmallT(); ret->setAsResult(&x.get()); ret->setReady(); return ret; } inline FutureImplBase* createVoidFutureImplReady() { constexpr size_t kImplSize = static_cast(nextPow2(sizeof(FutureImplSmall<16, void, void>))); using SmallT = FutureImplSmall; FutureImplBase* ret = new (allocSmallBuffer()) SmallT(); ret->setReady(); return ret; } template struct TaskSetInterceptionInvoker { TaskSetInterceptionInvoker(TaskSetType& ts) : taskSet(ts) {} void schedule(OnceFunction f) { savedOffFn = std::move(f); } void schedule(OnceFunction f, ForceQueuingTag) { savedOffFn = std::move(f); } TaskSetType& taskSet; OnceFunction savedOffFn; }; template class FutureBase { protected: FutureBase() noexcept : impl_(nullptr) {} FutureBase(FutureBase&& f) noexcept : impl_(f.impl_) { f.impl_ = nullptr; } FutureBase(const FutureBase& f) noexcept { impl_ = f.impl_; if (impl_) { impl_->incRefCount(); } } template FutureBase(F&& f, Schedulable& schedulable, std::launch asyncPolicy, std::launch deferredPolicy) : impl_( createFutureImpl( std::forward(f), (deferredPolicy & 
std::launch::deferred) == std::launch::deferred, nullptr)) { if ((asyncPolicy & std::launch::async) == std::launch::async) { schedulable.schedule(OnceFunction(impl_, true), ForceQueuingTag()); } else { schedulable.schedule(OnceFunction(impl_, true)); } } template FutureBase(F&& f, TaskSet& taskSet, std::launch asyncPolicy, std::launch deferredPolicy) : impl_( createFutureImpl( std::forward(f), (deferredPolicy & std::launch::deferred) == std::launch::deferred, &taskSet.outstandingTaskCount_)) { taskSet.outstandingTaskCount_.fetch_add(1, std::memory_order_acquire); if ((asyncPolicy & std::launch::async) == std::launch::async) { taskSet.pool().schedule(OnceFunction(impl_, true), ForceQueuingTag()); } else { taskSet.pool().schedule(OnceFunction(impl_, true)); } } template FutureBase(F&& f, ConcurrentTaskSet& taskSet, std::launch asyncPolicy, std::launch deferredPolicy) : impl_( createFutureImpl( std::forward(f), (deferredPolicy & std::launch::deferred) == std::launch::deferred, &taskSet.outstandingTaskCount_)) { taskSet.outstandingTaskCount_.fetch_add(1, std::memory_order_acquire); if ((asyncPolicy & std::launch::async) == std::launch::async) { taskSet.pool().schedule(OnceFunction(impl_, true), ForceQueuingTag()); } else { taskSet.pool().schedule(OnceFunction(impl_, true)); } } template FutureBase( F&& f, TaskSetInterceptionInvoker& invoker, std::launch asyncPolicy, std::launch deferredPolicy) : impl_( createFutureImpl( std::forward(f), (deferredPolicy & std::launch::deferred) == std::launch::deferred, &invoker.taskSet.outstandingTaskCount_)) { invoker.taskSet.outstandingTaskCount_.fetch_add(1, std::memory_order_acquire); if ((asyncPolicy & std::launch::async) == std::launch::async) { invoker.schedule(OnceFunction(impl_, true), ForceQueuingTag()); } else { invoker.schedule(OnceFunction(impl_, true)); } } void move(FutureBase&& f) noexcept { if (impl_ == f.impl_) { return; } else if (impl_) { impl_->decRefCountMaybeDestroy(); } impl_ = f.impl_; f.impl_ = nullptr; } void copy(const FutureBase& f) { if (impl_ != f.impl_) { if (impl_ != nullptr) { impl_->decRefCountMaybeDestroy(); } impl_ = f.impl_; if (impl_) { impl_->incRefCount(); } } } ~FutureBase() { if (impl_) { impl_->decRefCountMaybeDestroy(); } } bool valid() const noexcept { return impl_; } bool is_ready() const { assertValid(); return impl_->ready(); } void wait() const { assertValid(); impl_->wait(); } template std::future_status wait_for(const std::chrono::duration& timeoutDuration) const { assertValid(); return impl_->waitFor(timeoutDuration); } template std::future_status wait_until(const std::chrono::time_point& timeoutTime) const { assertValid(); return impl_->waitUntil(timeoutTime); } template FutureImplBase* thenImpl(F&& f, Schedulable& sched, std::launch asyncPolicy, std::launch deferredPolicy); template FutureImplBase* thenImpl(F&& f, TaskSet& sched, std::launch asyncPolicy, std::launch deferredPolicy); template FutureImplBase* thenImpl(F&& f, ConcurrentTaskSet& sched, std::launch asyncPolicy, std::launch deferredPolicy); #if defined DISPENSO_DEBUG void assertValid() const { assert(valid()); } #else void assertValid() const {} #endif // DISPENSO_DEBUG mutable FutureImplBase* impl_; }; struct ReadyTag {}; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/future_impl2.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ namespace dispenso { namespace detail { // Implementation note regarding then(): // // At some point we tried to add Future::then()&&. The impetus was that we figured we could avoid // making an extra copy of the *this, avoiding primarily an atomic reference increment and // decrement. This lead to a 15% speedup in benchmarks. The problem was that we were retaining a // copy of the bare pointer impl_ in order to call addToThenChainOrExecute. This could lead to a // case where our impl_ only had one reference going into addToThenChainOrExecute, and internal to // that, the func is scheduled, which creates a race between the execution and potential cleanup of // the future internals, against completion of addToThenChainOrExecute. // // In reality this leads to *requiring* a bump of the reference count prior to // addToThenChainOrExecute, and a decrement afterwards. This isn't at all cheaper (in benchmarks) // than just making the copy. The moral of the story? Don't bother with Future::then()&&. template template FutureImplBase* FutureBase::thenImpl( F&& f, Schedulable& sched, std::launch asyncPolicy, std::launch deferredPolicy) { Future copy(*this); auto func = [f = std::move(f), copy = std::move(copy)]() mutable -> RetResult { copy.wait(); return f(std::move(copy)); }; auto* retImpl = createFutureImpl( std::move(func), (deferredPolicy & std::launch::deferred) == std::launch::deferred, nullptr); impl_->addToThenChainOrExecute(retImpl, sched, asyncPolicy); return retImpl; } template template FutureImplBase* FutureBase::thenImpl( F&& f, TaskSet& sched, std::launch asyncPolicy, std::launch deferredPolicy) { Future copy(*this); auto func = [f = std::move(f), copy = std::move(copy)]() mutable -> RetResult { copy.wait(); return f(std::move(copy)); }; sched.outstandingTaskCount_.fetch_add(1, std::memory_order_acquire); auto* retImpl = createFutureImpl( std::move(func), (deferredPolicy & std::launch::deferred) == std::launch::deferred, &sched.outstandingTaskCount_); impl_->addToThenChainOrExecute(retImpl, sched.pool(), asyncPolicy); return retImpl; } template template FutureImplBase* FutureBase::thenImpl( F&& f, ConcurrentTaskSet& sched, std::launch asyncPolicy, std::launch deferredPolicy) { Future copy(*this); auto func = [f = std::move(f), copy = std::move(copy)]() mutable -> RetResult { copy.wait(); return f(std::move(copy)); }; sched.outstandingTaskCount_.fetch_add(1, std::memory_order_acquire); auto* retImpl = createFutureImpl( std::move(func), (deferredPolicy & std::launch::deferred) == std::launch::deferred, &sched.outstandingTaskCount_); impl_->addToThenChainOrExecute(retImpl, sched.pool(), asyncPolicy); return retImpl; } template struct ForEachApply { template void operator()(std::tuple& t, F f) { if (f(std::get(t))) { ForEachApply{}(t, f); } } }; template struct ForEachApply { template void operator()(std::tuple& t, F f) { f(std::get<0>(t)); } }; template void forEach(std::tuple& t, F f) { constexpr size_t size = std::tuple_size>::value; ForEachApply{}(t, f); } template struct WhenAllSharedVec { VecType vec; std::atomic count; OnceFunction f; template ::value_type> WhenAllSharedVec(InputIt first, InputIt last) : vec(first, last), count(vec.size()) {} }; template struct WhenAllSharedTuple { Tuple tuple; std::atomic count; OnceFunction f; template WhenAllSharedTuple(Types&&... 
args) : tuple(std::make_tuple(std::forward(args)...)), count(std::tuple_size::value) {} }; struct InterceptionInvoker { void schedule(OnceFunction f) { savedOffFn = std::move(f); } void schedule(OnceFunction f, ForceQueuingTag) { savedOffFn = std::move(f); } OnceFunction savedOffFn; }; template auto whenAllTuple(Invoker& invoker, Futures&&... futures) -> Future...>> { using TupleType = std::tuple...>; using ResultFuture = Future; // TODO(bbudge): Can write something faster than make_shared using SmallBufferAllocator. auto shared = std::make_shared>(std::forward(futures)...); auto whenComplete = [shared]() -> TupleType { forEach(shared->tuple, [&shared](auto& future) { if (0 == shared->count.load(std::memory_order_acquire)) { return false; } future.wait(); return true; }); return std::move(shared->tuple); }; ResultFuture res(std::move(whenComplete), invoker); shared->f = std::move(invoker.savedOffFn); // Avoid sequencing issue by getting the reference prior to the std::move. auto& tuple = shared->tuple; forEach(tuple, [shared = std::move(shared)](auto& future) { future.then( [shared](auto&&) { if (shared->count.fetch_sub(1, std::memory_order_release) == 1) { shared->f(); } }, kImmediateInvoker); return true; }); return res; } template Future::value_type>> whenAllIterators(Invoker& invoker, InputIt first, InputIt last) { using VecType = std::vector::value_type>; using ResultFuture = Future; if (first == last) { return make_ready_future(VecType()); } // TODO(bbudge): Can write something faster than make_shared using SmallBufferAllocator. auto shared = std::make_shared>(first, last); auto whenComplete = [shared]() -> VecType { for (auto& f : shared->vec) { if (0 == shared->count.load(std::memory_order_acquire)) { break; } f.wait(); } return std::move(shared->vec); }; ResultFuture res(std::move(whenComplete), invoker); shared->f = std::move(invoker.savedOffFn); for (auto& s : shared->vec) { s.then( [shared](auto&&) { if (shared->count.fetch_sub(1, std::memory_order_release) == 1) { shared->f(); } }, kImmediateInvoker); } return res; } } // namespace detail template Future::value_type>> when_all( InputIt first, InputIt last) { detail::InterceptionInvoker interceptor; return whenAllIterators(interceptor, first, last); } template Future::value_type>> when_all(TaskSet& taskSet, InputIt first, InputIt last) { detail::TaskSetInterceptionInvoker interceptor(taskSet); return whenAllIterators(interceptor, first, last); } template Future::value_type>> when_all(ConcurrentTaskSet& taskSet, InputIt first, InputIt last) { detail::TaskSetInterceptionInvoker interceptor(taskSet); return whenAllIterators(interceptor, first, last); } inline auto when_all() -> Future> { return make_ready_future(std::tuple<>()); } inline auto when_all(TaskSet&) -> Future> { return make_ready_future(std::tuple<>()); } inline auto when_all(ConcurrentTaskSet&) -> Future> { return make_ready_future(std::tuple<>()); } template auto when_all(Futures&&... futures) -> Future...>> { detail::InterceptionInvoker interceptor; return whenAllTuple(interceptor, std::forward(futures)...); } template auto when_all(TaskSet& taskSet, Futures&&... futures) -> Future...>> { detail::TaskSetInterceptionInvoker interceptor(taskSet); return whenAllTuple(interceptor, std::forward(futures)...); } template auto when_all(ConcurrentTaskSet& taskSet, Futures&&... 
futures) -> Future...>> { detail::TaskSetInterceptionInvoker interceptor(taskSet); return whenAllTuple(interceptor, std::forward(futures)...); } } // namespace dispenso ================================================ FILE: dispenso/detail/graph_executor_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace detail { // Maximum recursion depth for inline graph node scheduling before forcing work // through the thread pool queue. Prevents unbounded stack growth when CTS inlines // evaluateNodeConcurrently calls for additional ready dependents. static constexpr int kMaxGraphInlineDepth = 32; class ExecutorBase { protected: inline static bool hasNoIncompletePredecessors(const dispenso::Node& node) { return node.numIncompletePredecessors_.load(std::memory_order_relaxed) == 0; } inline static void addIncompletePredecessor(const dispenso::Node& node) { if (node.isCompleted()) { node.numIncompletePredecessors_.store(1, std::memory_order_relaxed); } else { node.numIncompletePredecessors_.fetch_add(1, std::memory_order_relaxed); } } inline static void ifIncompleteAddIncompletePredecessor(const dispenso::Node& node) { if (!node.isCompleted()) { node.numIncompletePredecessors_.fetch_add(1, std::memory_order_relaxed); } } inline static bool decNumIncompletePredecessors( const dispenso::Node& node, std::memory_order order) { return node.numIncompletePredecessors_.fetch_sub(1, order) == 1; } inline static bool decNumIncompletePredecessors( const dispenso::BiPropNode& node, std::memory_order order) { const std::memory_order loadOrder = order == std::memory_order_relaxed ? std::memory_order_relaxed : std::memory_order_acquire; if (node.numIncompletePredecessors_.load(loadOrder) == dispenso::Node::kCompleted) { return false; } return node.numIncompletePredecessors_.fetch_sub(1, order) == 1; } template inline static void evaluateNodeConcurrently(dispenso::ConcurrentTaskSet& tasks, const N* node) { // Track recursion depth to prevent stack overflow. CTS may inline scheduled // lambdas when load is high, causing evaluateNodeConcurrently to recurse // through the schedule→inline→evaluate path. When depth exceeds the limit, // force work through the pool queue to bound stack growth. 
static DISPENSO_THREAD_LOCAL int depth = 0; struct DepthGuard { DISPENSO_INLINE DepthGuard(int& d) : d_(d) { ++d_; } DISPENSO_INLINE ~DepthGuard() { --d_; } int& d_; }; DepthGuard dGuard(depth); // Process nodes in a loop, continuing inline with first ready dependent // to avoid task scheduling overhead on the critical path while (node != nullptr) { node->run(); const N* inlineNext = nullptr; for (const dispenso::Node* const d : node->dependents_) { if (decNumIncompletePredecessors(static_cast(*d), std::memory_order_acq_rel)) { const N* dep = static_cast(d); if (inlineNext == nullptr) { // First ready dependent: continue with it inline inlineNext = dep; } else if (depth < kMaxGraphInlineDepth) { // Additional ready dependents: schedule to task queue tasks.schedule([&tasks, dep]() { evaluateNodeConcurrently(tasks, dep); }); } else { // Depth limit reached: force enqueue to prevent stack overflow tasks.schedule( [&tasks, dep]() { evaluateNodeConcurrently(tasks, dep); }, dispenso::ForceQueuingTag()); } } } node = inlineNext; } } static void appendGroup( const dispenso::Node* /* node */, std::unordered_set*>& /* groups */) {} static void appendGroup( const dispenso::BiPropNode* node, std::unordered_set*>& groups) { const std::vector* group = node->biPropSet_.get(); if (group != nullptr) { groups.insert(group); } } }; } // namespace detail ================================================ FILE: dispenso/detail/math.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #if defined(_WIN32) #include #endif //_WIN32 namespace dispenso { namespace detail { constexpr uint64_t nextPow2(uint64_t v) { // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v |= v >> 32; v++; return v; } constexpr inline uint32_t log2const(uint64_t v) { constexpr uint64_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000UL}; constexpr uint32_t S[] = {1, 2, 4, 8, 16, 32}; uint32_t r = 0; for (uint32_t i = 6; i--;) { if (v & b[i]) { v >>= S[i]; r |= S[i]; } } return r; } constexpr inline uint32_t log2const(uint32_t v) { constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; constexpr uint32_t S[] = {1, 2, 4, 8, 16}; uint32_t r = 0; for (uint32_t i = 5; i--;) { if (v & b[i]) { v >>= S[i]; r |= S[i]; } } return r; } // On some platforms (e.g. macOS ARM64), size_t is 'unsigned long' which is a distinct type // from both uint32_t and uint64_t, causing overload ambiguity. This template resolves it // by delegating to the 64-bit overload; it is SFINAE-disabled on platforms where unsigned // long already matches one of the fixed-width types (Linux, Windows), so no behavior change. 
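As a quick illustration of the helpers above (and of why the unsigned long forwarding overload that follows is convenient for size_t callers), the constexpr versions can be exercised directly at compile time. These asserts are illustrative additions, not part of the file:

#include <cstdint>
#include <dispenso/detail/math.h> // include path assumed

static_assert(dispenso::detail::nextPow2(uint64_t{17}) == 32, "rounds up to the next power of two");
static_assert(dispenso::detail::nextPow2(uint64_t{32}) == 32, "powers of two map to themselves");
static_assert(dispenso::detail::log2const(uint64_t{1} << 40) == 40, "floor log2 of a 64-bit value");
static_assert(dispenso::detail::log2const(uint32_t{48}) == 5, "floor log2 of a 32-bit value (48 -> 5)");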
template < typename T = unsigned long, typename std::enable_if< !std::is_same::value && !std::is_same::value, int>::type = 0> constexpr inline uint32_t log2const(T v) { return log2const(static_cast(v)); } // --- 64-bit log2 --- #if (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__) inline uint32_t log2(uint64_t v) { uint64_t result; __asm__("bsrq %1, %0" : "=r"(result) : "r"(v)); return static_cast(result); } #elif (defined(__GNUC__) || defined(__clang__)) inline uint32_t log2(uint64_t v) { return static_cast(63 - __builtin_clzll(v)); } #elif defined(_WIN64) inline uint32_t log2(uint64_t v) { unsigned long index; _BitScanReverse64(&index, v); return static_cast(index); } #elif defined(_WIN32) inline uint32_t log2(uint64_t v) { unsigned long index; uint32_t hi = static_cast(v >> 32); if (hi != 0) { _BitScanReverse(&index, hi); return static_cast(index + 32); } _BitScanReverse(&index, static_cast(v)); return static_cast(index); } #else inline uint32_t log2(uint64_t v) { return log2const(v); } #endif // 64-bit template < typename T = unsigned long, typename std::enable_if< !std::is_same::value && !std::is_same::value, int>::type = 0> inline uint32_t log2(T v) { return log2(static_cast(v)); } // --- 32-bit log2 --- #if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) inline uint32_t log2(uint32_t v) { uint32_t result; __asm__("bsrl %1, %0" : "=r"(result) : "r"(v)); return result; } #elif (defined(__GNUC__) || defined(__clang__)) inline uint32_t log2(uint32_t v) { return static_cast(31 - __builtin_clz(v)); } #elif defined(_WIN32) inline uint32_t log2(uint32_t v) { unsigned long index; _BitScanReverse(&index, v); return static_cast(index); } #else inline uint32_t log2(uint32_t v) { return log2const(v); } #endif // 32-bit } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/notifier_common.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once // For fallback path #include #include #if defined(__linux__) #include #include #include #include namespace dispenso { namespace detail { static int futex( int* uaddr, int futex_op, int val, const struct timespec* timeout, int* /*uaddr2*/, int val3) { return static_cast(syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr, val3)); } } // namespace detail } // namespace dispenso #elif defined(__MACH__) #include #include #include // Detect os_sync_wait_on_address availability (macOS 14.4+) #if defined(__has_include) #if __has_include() #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 140400 #define DISPENSO_HAS_OS_SYNC 1 #include #endif #endif #endif #ifndef DISPENSO_HAS_OS_SYNC // __ulock APIs are available since macOS 10.12 (Sierra). On older versions (e.g. PPC/10.5 builds // for MacPorts), fall through to the std::mutex fallback. 
#if !defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || __MAC_OS_X_VERSION_MIN_REQUIRED >= 101200 #define DISPENSO_HAS_ULOCK 1 extern "C" int __ulock_wait(uint32_t operation, void* addr, uint64_t value, uint32_t timeout_us); extern "C" int __ulock_wake(uint32_t operation, void* addr, uint64_t value); #ifndef UL_COMPARE_AND_WAIT #define UL_COMPARE_AND_WAIT 1 #endif #ifndef ULF_WAKE_ALL #define ULF_WAKE_ALL 0x00000100 #endif #endif // macOS >= 10.12 #endif // DISPENSO_HAS_OS_SYNC // DISPENSO_HAS_MAC_FUTEX is set when any mac futex-like API is available. #if defined(DISPENSO_HAS_OS_SYNC) || defined(DISPENSO_HAS_ULOCK) #define DISPENSO_HAS_MAC_FUTEX 1 #endif #ifdef DISPENSO_HAS_MAC_FUTEX namespace dispenso { namespace detail { inline void mac_futex_wait(void* addr, uint64_t expected, size_t size) { #ifdef DISPENSO_HAS_OS_SYNC os_sync_wait_on_address(addr, expected, size, OS_SYNC_WAIT_ON_ADDRESS_NONE); #else (void)size; __ulock_wait(UL_COMPARE_AND_WAIT, addr, expected, 0); #endif } inline int mac_futex_wait_for(void* addr, uint64_t expected, size_t size, uint64_t relTimeUs) { #ifdef DISPENSO_HAS_OS_SYNC static mach_timebase_info_data_t sTimebaseInfo = []() { mach_timebase_info_data_t i; mach_timebase_info(&i); return i; }(); uint64_t ns = relTimeUs * 1000; uint64_t timeout = ns * sTimebaseInfo.denom / sTimebaseInfo.numer; return os_sync_wait_on_address_with_timeout( addr, expected, size, OS_SYNC_WAIT_ON_ADDRESS_NONE, OS_CLOCK_MACH_ABSOLUTE_TIME, timeout); #else (void)size; // __ulock_wait takes a uint32_t timeout in microseconds, which wraps at ~4295 seconds (~72 min). // Timeouts beyond that cause a spurious early wake, which is harmless since callers re-check // status in a loop. return __ulock_wait(UL_COMPARE_AND_WAIT, addr, expected, static_cast(relTimeUs)); #endif } inline void mac_futex_wake_one(void* addr, size_t size) { #ifdef DISPENSO_HAS_OS_SYNC os_sync_wake_by_address_any(addr, size, OS_SYNC_WAKE_BY_ADDRESS_NONE); #else (void)size; __ulock_wake(UL_COMPARE_AND_WAIT, addr, 0); #endif } inline void mac_futex_wake_all(void* addr, size_t size) { #ifdef DISPENSO_HAS_OS_SYNC os_sync_wake_by_address_all(addr, size, OS_SYNC_WAKE_BY_ADDRESS_NONE); #else (void)size; __ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, addr, 0); #endif } } // namespace detail } // namespace dispenso #endif // DISPENSO_HAS_MAC_FUTEX #elif defined(_WIN32) #if (defined(_M_ARM64) || defined(__aarch64__)) && !defined(_ARM64_) #define _ARM64_ #elif (defined(_M_ARM) || defined(__arm__)) && !defined(_ARM_) #define _ARM_ #elif (defined(_M_AMD64) || defined(__x86_64__) || defined(_WIN64)) && !defined(_AMD64_) #define _AMD64_ #elif !defined(_X86_) #define _X86_ #endif // platform #include #include namespace dispenso { namespace detail { constexpr int kErrorTimeoutWin = 0x000005B4; constexpr unsigned long kInfiniteWin = static_cast(-1); } // namespace detail } // namespace dispenso #endif // PLATFORM ================================================ FILE: dispenso/detail/once_callable_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace dispenso { namespace detail { class OnceCallable { public: virtual void run() = 0; virtual void destroyOnly() = 0; virtual ~OnceCallable() = default; }; struct OnceCallableData { void* data; void (*invoke)(void*, bool run); }; template void invokeImpl(void* ptr, bool run) { F* f = static_cast(ptr); if (DISPENSO_EXPECT(run, true)) { (*f)(); } f->~F(); deallocSmallBuffer(ptr); } template inline OnceCallableData createOnceCallable(F&& f) { using FNoRef = typename std::remove_reference::type; constexpr size_t kAllocSize = static_cast(nextPow2(std::max(sizeof(FNoRef), alignof(FNoRef)))); void* buf = allocSmallBuffer(); new (buf) FNoRef(std::forward(f)); return {buf, &invokeImpl}; } inline void runOnceCallable(void* ptr, bool run) { auto* callable = static_cast(ptr); if (DISPENSO_EXPECT(run, true)) { callable->run(); } else { callable->destroyOnly(); } } } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/op_result.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace dispenso { namespace detail { template class OpResult { public: OpResult() : ptr_(nullptr) {} template < typename U, typename = typename std::enable_if< !std::is_same::type, OpResult>::value>::type> OpResult(U&& u) : ptr_(new (buf_) T(std::forward(u))) {} OpResult(const OpResult& oth) : ptr_(oth ? new (buf_) T(*oth.ptr_) : nullptr) {} OpResult(OpResult&& oth) : ptr_(oth ? new (buf_) T(std::move(*oth.ptr_)) : nullptr) { oth.ptr_ = nullptr; } OpResult& operator=(const OpResult& oth) { if (&oth == this) { return *this; } if (ptr_) { ptr_->~T(); } if (oth) { ptr_ = new (buf_) T(*oth.ptr_); } else { ptr_ = nullptr; } return *this; } OpResult& operator=(OpResult&& oth) { if (&oth == this) { return *this; } if (ptr_) { ptr_->~T(); } if (oth) { ptr_ = new (buf_) T(std::move(*oth.ptr_)); oth.ptr_ = nullptr; } else { ptr_ = nullptr; } return *this; } ~OpResult() { if (ptr_) { ptr_->~T(); } } template T& emplace(Args&&... args) { if (ptr_) { ptr_->~T(); } ptr_ = new (buf_) T(std::forward(args)...); return *ptr_; } operator bool() const { return ptr_; } bool has_value() const { return ptr_; } T& value() { return *ptr_; } private: alignas(T) char buf_[sizeof(T)]; T* ptr_; }; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/per_thread_info.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include namespace dispenso { namespace detail { namespace { DISPENSO_THREAD_LOCAL PerThreadInfo g_perThreadInfo; } PerThreadInfo& PerPoolPerThreadInfo::info() { return g_perThreadInfo; } } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/per_thread_info.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include namespace dispenso { namespace detail { struct alignas(kCacheLineSize) PerThreadInfo { void* pool = nullptr; void* producer = nullptr; int parForRecursionLevel = 0; }; class ParForRecursion { public: ~ParForRecursion() { --parForRecursionLevel_; } private: ParForRecursion(int& parForRecursionLevel) : parForRecursionLevel_(parForRecursionLevel) { ++parForRecursionLevel_; } int& parForRecursionLevel_; friend class PerPoolPerThreadInfo; }; class PerPoolPerThreadInfo { public: static void registerPool(void* pool, void* producer) { auto& i = info(); i.pool = pool; i.producer = producer; } static void* producer(void* pool) { auto& i = info(); return i.pool == pool ? i.producer : nullptr; } static bool isParForRecursive(void* pool) { auto& i = info(); return (!i.pool || i.pool == pool) && i.parForRecursionLevel > 0; } static bool isPoolRecursive(void* pool) { return info().pool == pool; } static ParForRecursion parForRecurse() { return ParForRecursion(info().parForRecursionLevel); } private: DISPENSO_DLL_ACCESS static PerThreadInfo& info(); }; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/pipeline_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #if __cplusplus >= 201703L #include #endif // C++17 #include #include #include #include #include namespace dispenso { namespace detail { // Maximum depth for inline pipeline continuation before forcing a schedule through // the thread pool. Prevents unbounded stack growth when the pool inlines execution. static constexpr int kMaxPipelineInlineDepth = 32; class LimitGatedScheduler { public: LimitGatedScheduler(ConcurrentTaskSet& tasks, ssize_t res) : impl_(new (alignedMalloc(sizeof(Impl), alignof(Impl))) Impl(tasks, res)) {} template void schedule(F&& fPipe) { impl_->schedule(std::forward(fPipe)); } void wait() { impl_->wait(); } private: // Put the guts within a unique_ptr to enable this type to be movable. class Impl { public: Impl(ConcurrentTaskSet& tasks, ssize_t res) : tasks_(tasks), resources_(res), unlimited_(res == std::numeric_limits::max()), serial_(res == 1) {} template void schedule(F&& fPipe) { outstanding_.fetch_add(1, std::memory_order_acq_rel); // RAII guard ensures outstanding_ is decremented even if an exception propagates. // Without this, wait() would hang spinning on outstanding_ reaching zero. struct OutstandingGuard { DISPENSO_INLINE ~OutstandingGuard() { outstanding_.fetch_sub(1, std::memory_order_acq_rel); } std::atomic& outstanding_; }; if (unlimited_) { tasks_.schedule([this, fPipe = std::move(fPipe)]() mutable { OutstandingGuard oGuard{outstanding_}; if (!tasks_.hasException()) { fPipe([]() {}); } }); return; } DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); queue_.enqueue([this, fPipe = std::move(fPipe)]() mutable { OutstandingGuard oGuard{outstanding_}; // RAII guard releases the resource slot if the completion callback is never // called (i.e. if the user's stage function throws before reaching it). // Disarmed on the normal path when the completion callback fires. 
struct ResourceGuard { DISPENSO_INLINE ~ResourceGuard() { if (armed_) { resources_.fetch_add(1, std::memory_order_acq_rel); } } DISPENSO_INLINE void disarm() { armed_ = false; } std::atomic& resources_; bool armed_{true}; }; ResourceGuard rGuard{resources_}; #if defined(__cpp_exceptions) try { #endif fPipe([this, &rGuard]() { rGuard.disarm(); OnceFunction func; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool deqd = queue_.try_dequeue(func); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (deqd) { // For serial stages (limit=1), execute continuation inline to reduce // thread pool scheduling overhead. This is safe because we're already // on a worker thread and only one item can be in the stage at a time. // Depth-limit to prevent unbounded stack growth when the pool inlines. if (serial_) { static DISPENSO_THREAD_LOCAL int depth = 0; if (depth < kMaxPipelineInlineDepth) { struct DepthGuard { DISPENSO_INLINE DepthGuard(int& d) : d_(d) { ++d_; } DISPENSO_INLINE ~DepthGuard() { --d_; } int& d_; }; DepthGuard dGuard(depth); func(); } else { tasks_.schedule(std::move(func), ForceQueuingTag()); } } else { tasks_.schedule(std::move(func)); } } else { resources_.fetch_add(1, std::memory_order_acq_rel); } }); #if defined(__cpp_exceptions) } catch (...) { tasks_.trySetCurrentException(); } #endif }); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); while (resources_.fetch_sub(1, std::memory_order_acq_rel) > 0) { OnceFunction func; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool deqd = queue_.try_dequeue(func); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (deqd) { tasks_.schedule(std::move(func)); } else { break; } } resources_.fetch_add(1, std::memory_order_acq_rel); } void wait() { if (!unlimited_) { // LIMITED path: drain items from the local queue into the // ConcurrentTaskSet, and spin until all outstanding items for this // stage have completed. We must spin on outstanding_ because in-flight // CTS tasks from the previous stage may call pipeNext_.execute() after // this drain starts, enqueuing new items into our local queue. Without // the outstanding_ check, those late-arriving items could be orphaned // if schedule()'s try_dequeue spuriously misses them. while (outstanding_.load(std::memory_order_acquire)) { if (tasks_.hasException()) { // Drain remaining queued items without executing them. OnceFunction discard; while (queue_.try_dequeue(discard)) { outstanding_.fetch_sub(1, std::memory_order_acq_rel); discard.cleanupNotRun(); } break; } OnceFunction func; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool deqd = queue_.try_dequeue(func); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (deqd) { // Spin until a resource slot is available. Check for exceptions // each iteration to avoid deadlocking when all pool threads have // finished and no one will release a resource. while (resources_.fetch_sub(1, std::memory_order_acq_rel) <= 0) { resources_.fetch_add(1, std::memory_order_acq_rel); if (tasks_.hasException()) { outstanding_.fetch_sub(1, std::memory_order_acq_rel); func.cleanupNotRun(); goto next_item; } if (!tasks_.tryExecuteNext()) { std::this_thread::yield(); } } tasks_.schedule(std::move(func)); next_item:; } else if (!tasks_.tryExecuteNext()) { std::this_thread::yield(); } } return; } // Unified drain + wait loop. 
We must keep checking the local queue // because items can arrive after an earlier drain pass — this happens // when a completion callback's try_dequeue races with a concurrent // enqueue: the callback sees an empty queue and releases the resource // instead of chaining, leaving the item orphaned in the queue. // Re-checking on every iteration ensures we eventually dispatch it. while (outstanding_.load(std::memory_order_acquire)) { // For the unlimited path, items are scheduled directly to CTS (not // queued locally). When an exception occurs, remaining items are // already in CTS's pool queue wrapped by packageTask — CTS::wait() // will drain them. Break out here to avoid spinning. if (tasks_.hasException()) { break; } OnceFunction func; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool deqd = queue_.try_dequeue(func); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (deqd) { // Wait for resource to become available while (resources_.fetch_sub(1, std::memory_order_acq_rel) <= 0) { resources_.fetch_add(1, std::memory_order_acq_rel); if (!tasks_.tryExecuteNext()) { std::this_thread::yield(); } } tasks_.schedule(std::move(func)); } else if (!tasks_.tryExecuteNext()) { std::this_thread::yield(); } } } private: ConcurrentTaskSet& tasks_; alignas(kCacheLineSize) std::atomic resources_; alignas(kCacheLineSize) std::atomic outstanding_{0}; moodycamel::ConcurrentQueue queue_; // Note: In benchmarks, this doesn't seem to help very much (~1%), but using it should lower // resource requirements because the queue_ never needs to instantiate memory. const bool unlimited_; const bool serial_; }; struct Deleter { void operator()(Impl* r) { r->~Impl(); alignedFree(r); } }; std::unique_ptr impl_; }; template struct Stage { Stage(F&& fIn, ssize_t limitIn) : f(std::move(fIn)), limit(limitIn) {} template auto operator()(T&& t) { return f(std::forward(t)); } auto operator()() { return f(); } F f; ssize_t limit; }; template struct StageLimits { constexpr static ssize_t limit(const T& /*t*/) { return 1; } }; template struct StageLimits> { static ssize_t limit(const Stage& t) { return std::max(ssize_t{1}, t.limit); } }; enum class StageClass { kSingleStage, kGenerator, kOpTransform, kTransform, kSink }; template struct TransformTraits { static constexpr StageClass kStageClass = StageClass::kTransform; }; #if __cplusplus >= 201703L template struct TransformTraits> { static constexpr StageClass kStageClass = StageClass::kOpTransform; }; #endif // C++17 template struct TransformTraits> { static constexpr StageClass kStageClass = StageClass::kOpTransform; }; template struct OptionalStrippedTraits { using Type = T; }; #if __cplusplus >= 201703L template struct OptionalStrippedTraits> { using Type = T; }; #endif // C++17 template struct OptionalStrippedTraits> { using Type = T; }; struct SinkPipe {}; template class Pipe; template class TransformPipe { public: template TransformPipe(ConcurrentTaskSet& tasks, StageIn&& s, PipeNext&& n) : stage_(std::forward(s)), tasks_(tasks, StageLimits::limit(stage_)), pipeNext_(std::move(n)) {} void wait() { tasks_.wait(); pipeNext_.wait(); } protected: CurStage stage_; LimitGatedScheduler tasks_; PipeNext pipeNext_; }; template class Pipe : public TransformPipe { public: template Pipe(ConcurrentTaskSet& tasks, StageIn&& s, PipeNext&& n) : TransformPipe(tasks, std::forward(s), std::move(n)) {} template void execute(Input&& input) { this->tasks_.schedule([input = std::move(input), this](auto&& stageCompleteFunc) mutable { auto&& res = this->stage_(std::move(input)); stageCompleteFunc(); 
this->pipeNext_.execute(res); }); } }; template class Pipe : public TransformPipe { public: template Pipe(ConcurrentTaskSet& tasks, StageIn&& s, PipeNext&& n) : TransformPipe(tasks, std::forward(s), std::move(n)) {} template void execute(Input&& input) { this->tasks_.schedule([input = std::move(input), this](auto&& stageCompleteFunc) mutable { auto op = this->stage_(std::move(input)); stageCompleteFunc(); if (op) { this->pipeNext_.execute(std::move(op.value())); } }); } }; template class Pipe { public: template Pipe(ConcurrentTaskSet& tasks, StageIn&& s, PipeNext&& n) : tasks_(tasks), stage_(std::forward(s)), pipeNext_(std::move(n)) {} void execute() { ssize_t numThreads = std::max( 1, std::min(tasks_.numPoolThreads(), StageLimits::limit(stage_))); completion_ = std::make_unique(static_cast(numThreads)); for (ssize_t i = 0; i < numThreads; ++i) { tasks_.schedule([this]() { // RAII guard ensures the completion event is signaled even if an exception // propagates out of pipeNext_.execute() (e.g. when ConcurrentTaskSet runs a // downstream stage inline and it throws). Without this, wait() would hang on // completion_->wait(0) because the count is never decremented. struct CompletionGuard { DISPENSO_INLINE ~CompletionGuard() { if (completion->intrusiveStatus().fetch_sub(1, std::memory_order_acq_rel) == 1) { completion->notify(0); } } CompletionEventImpl* completion; }; CompletionGuard cGuard{completion_.get()}; while (!tasks_.hasException()) { auto op = stage_(); if (!op) { break; } pipeNext_.execute(std::move(op.value())); } }); } } void wait() { completion_->wait(0); pipeNext_.wait(); tasks_.wait(); } private: ConcurrentTaskSet& tasks_; std::unique_ptr completion_; CurStage stage_; PipeNext pipeNext_; }; template class Pipe { public: template Pipe(ConcurrentTaskSet& tasks, StageIn&& s) : tasks_(tasks), stage_(std::forward(s)) {} void execute() { size_t numThreads = std::min(tasks_.numPoolThreads(), StageLimits::limit(stage_)); for (size_t i = 0; i < numThreads; ++i) { tasks_.schedule([this]() { while (!tasks_.hasException() && stage_()) { } }); } } void wait() { tasks_.wait(); } private: ConcurrentTaskSet& tasks_; CurStage stage_; }; template class Pipe { public: template Pipe(ConcurrentTaskSet& tasks, StageIn&& s) : stage_(std::forward(s)), tasks_(tasks, StageLimits::limit(stage_)) {} template void execute(Input&& input) { tasks_.schedule([input = std::move(input), this](auto&& stageCompleteFunc) mutable { stage_(std::move(input)); stageCompleteFunc(); }); } void wait() { tasks_.wait(); } private: CurStage stage_; LimitGatedScheduler tasks_; }; template auto makePipesHelper(ConcurrentTaskSet& tasks, Stage0&& sCur) { return Pipe(tasks, std::forward(sCur)); } template auto makePipesHelper( ConcurrentTaskSet& tasks, Stage0&& sCur, Stage1&& sNext, Stages&&... sFollowing) { using Stage0Result = ResultOf::Type>; auto pipe = makePipesHelper( tasks, std::forward(sNext), std::forward(sFollowing)...); constexpr StageClass kSc = TransformTraits::kStageClass; return Pipe(tasks, std::forward(sCur), std::move(pipe)); } template auto makePipes(ConcurrentTaskSet& tasks, Stage0&& sCur, Stage1&& sNext, Stages&&... 
sFollowing) { auto pipe = makePipesHelper>( tasks, std::forward(sNext), std::forward(sFollowing)...); return Pipe( tasks, std::forward(sCur), std::move(pipe)); } template auto makePipes(ConcurrentTaskSet& tasks, Stage0&& sCur) { return Pipe(tasks, std::forward(sCur)); } } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/quanta.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #ifdef _WIN32 #include #include #endif #include namespace dispenso { #ifdef _WIN32 namespace { struct OsQuantaSetter { OsQuantaSetter() { timeBeginPeriod(1); } ~OsQuantaSetter() { timeEndPeriod(1); } }; } // namespace #else namespace { struct OsQuantaSetter {}; } // namespace #endif // _WIN32 namespace detail { void registerFineSchedulerQuanta() { static OsQuantaSetter setter; (void)setter; } } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/quanta.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once namespace dispenso { namespace detail { void registerFineSchedulerQuanta(); } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/result_of.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace dispenso { namespace detail { #if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703L template using ResultOf = typename std::invoke_result_t, std::decay_t...>; #else template using ResultOf = typename std::result_of::type(typename std::decay::type...)>::type; #endif // c++17 } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/rw_lock_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include namespace dispenso { namespace detail { class RWLockImpl { public: /** * Locks for write access * * @note It is undefined behavior to recursively lock **/ void lock(); /** * Tries to lock for write access, returns if unable to lock * * @return true if lock was acquired, false otherwise **/ bool try_lock(); /** * Unlocks write access * * @note Must already be locked by the current thread of execution, otherwise, the behavior is * undefined. **/ void unlock(); /** * Locks for read access * * @note It is undefined behavior to recursively lock **/ void lock_shared(); /** * Tries to lock for read access, returns if unable to lock * * @return true if lock was acquired, false otherwise * * @note It is undefined behavior to recursively lock **/ bool try_lock_shared(); /** * Unlocks read access * * @note Must already be locked by the current thread of execution, otherwise, the behavior is * undefined. **/ void unlock_shared(); /** * Upgrade from a reader lock to a writer lock. 
lock_upgrade is a power-user interface. There is * a very good reason why it is not exposed as upgrade_mutex in the standard. To use it safely, * you *MUST* ensure only one thread can try to lock for write concurrently. If that cannot be * guaranteed, you should unlock for read, and lock for write instead of using lock_upgrade to * avoid potential deadlock. * * @note Calling this if the writer lock is already held, or if no reader lock is already held is * undefined behavior. **/ void lock_upgrade(); /** * Downgrade the lock from a writer lock to a reader lock. * * @note Calling this if the writer lock is not held results in undefined behavior **/ void lock_downgrade(); private: static constexpr uint32_t kWriteBit = 0x80000000; static constexpr uint32_t kReaderBits = 0x7fffffff; std::atomic lock_{0}; }; inline void RWLockImpl::lock() { uint32_t val = lock_.fetch_or(kWriteBit, std::memory_order_acq_rel); while (val & kWriteBit) { val = lock_.fetch_or(kWriteBit, std::memory_order_acq_rel); } // We've claimed single write ownership now. We need to drain off readers while (val != kWriteBit) { val = lock_.load(std::memory_order_acquire); } } inline bool RWLockImpl::try_lock() { uint32_t val = lock_.fetch_or(kWriteBit, std::memory_order_acq_rel); return !(val & kWriteBit); } inline void RWLockImpl::unlock() { lock_.fetch_and(kReaderBits, std::memory_order_acq_rel); } inline void RWLockImpl::lock_shared() { uint32_t val = lock_.fetch_add(1, std::memory_order_acq_rel); while (val & kWriteBit) { val = lock_.fetch_sub(1, std::memory_order_acq_rel); while (val & kWriteBit) { val = lock_.load(std::memory_order_acquire); } val = lock_.fetch_add(1, std::memory_order_acq_rel); } } inline bool RWLockImpl::try_lock_shared() { uint32_t val = lock_.fetch_add(1, std::memory_order_acq_rel); if (val & kWriteBit) { lock_.fetch_sub(1, std::memory_order_acq_rel); return false; } return true; } inline void RWLockImpl::unlock_shared() { lock_.fetch_sub(1, std::memory_order_acq_rel); } inline void RWLockImpl::lock_upgrade() { uint32_t val = lock_.fetch_or(kWriteBit, std::memory_order_acq_rel); while (val & kWriteBit) { val = lock_.fetch_or(kWriteBit, std::memory_order_acq_rel); } // We've claimed single write ownership now. We need to drain off readers, including ourself lock_.fetch_sub(1, std::memory_order_acq_rel); while (val != kWriteBit) { val = lock_.load(std::memory_order_acquire); } } inline void RWLockImpl::lock_downgrade() { // Get reader ownership first lock_.fetch_add(1, std::memory_order_acq_rel); unlock(); } } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/small_buffer_allocator_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace dispenso { namespace detail { struct SmallBufferGlobals { moodycamel::ConcurrentQueue centralStore; std::vector backingStore; std::atomic backingStoreLock{0}; ~SmallBufferGlobals() { for (char* b : backingStore) { alignedFree(b); } } }; template DISPENSO_DLL_ACCESS SmallBufferGlobals& getSmallBufferGlobals(); /** * A class for allocating small chunks of memory quickly. The class is built on concepts of * thread-local pools of buffers of specific sizes. 
It is best to limit the distinct number of * sizes of chunk sizes used in one program or there is a good chance that there may be a lot of * unused memory allocated in the system. * * SmallBufferAllocator is completely thread-safe. **/ template class SmallBufferAllocator { private: static constexpr size_t kLogFactor = log2const(kChunkSize | 1); // TODO(T88183021): Make these factors compile-time configurable. For example, the current values // can lead to megabytes of data being allocated, even if the allocator is only used for one or // two allocations. Likely we can reduce these sizes by a decent factor without affecting // benchmarks, and then reduce them even further as an option. static constexpr size_t kMallocBytes = (1 << 12) * kLogFactor; static constexpr size_t kIdealTLCacheBytes = kMallocBytes / 4; static constexpr size_t kIdealNumTLBuffers = kIdealTLCacheBytes / kChunkSize; static constexpr size_t kMaxNumTLBuffers = 2 * kIdealNumTLBuffers; static constexpr size_t kBuffersPerMalloc = kMallocBytes / kChunkSize; static_assert(kIdealNumTLBuffers > 0, "Must have a positive number of buffers to work with"); public: /** * Allocate a buffer of kChunkSize bytes. * * @return a pointer to the buffer. **/ static char* alloc() { auto bnc = buffersAndCount(); char** tlBuffers = std::get<0>(bnc); size_t& tlCount = std::get<1>(bnc); if (!tlCount) { // We only need to register (at least) once when we grab from the central store; without going // to the central store at least once (or calling dealloc first), we cannot have buffers to // return. registerCleanup(); tlCount = grabFromCentralStore(tlBuffers); } return tlBuffers[--tlCount]; } /** * Deallocate a buffer previously allocated via alloc * * @param buffer The buffer to deallocate. **/ static void dealloc(char* buffer) { // We need to register at least once for any call to dealloc, because we may dealloc on this // thread without having allocated on the same thread, and if we don't register, any memory in // the thread-local buffers will not be returned to the central store on thread destruction. auto bnc = buffersAndCount(); char** tlBuffers = std::get<0>(bnc); size_t& tlCount = std::get<1>(bnc); registerCleanup(); tlBuffers[tlCount++] = buffer; if (tlCount == kMaxNumTLBuffers) { recycleToCentralStore(tlBuffers + kIdealNumTLBuffers, kIdealNumTLBuffers); tlCount -= kIdealNumTLBuffers; } } /** * Get the approximate number of underlying bytes allocated by the allocator. This is mostly for * testing and debugging. * * @return The approximate number of bytes currently allocated. 
**/ static size_t bytesAllocated() { uint32_t allocId = 0; auto& globals = getSmallBufferGlobals(); auto& lock = globals.backingStoreLock; while (!lock.compare_exchange_weak(allocId, 1, std::memory_order_acquire)) { } size_t bytes = kMallocBytes * globals.backingStore.size(); lock.store(0, std::memory_order_release); return bytes; } private: struct PerThreadQueuingData { PerThreadQueuingData( moodycamel::ConcurrentQueue& cstore, std::tuple buffersAndCount) : cstore_(cstore), buffers_(std::get<0>(buffersAndCount)), count_(std::get<1>(buffersAndCount)) { DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); new (ptokenBuf_) moodycamel::ProducerToken(cstore); new (ctokenBuf_) moodycamel::ConsumerToken(cstore); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); } ~PerThreadQueuingData(); void enqueue_bulk(char** buffers, size_t count) { DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); cstore_.enqueue_bulk(ptoken(), buffers, count); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); } size_t try_dequeue_bulk(char** buffers, size_t count) { DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); size_t actual = cstore_.try_dequeue_bulk(ctoken(), buffers, count); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); return actual; } private: moodycamel::ProducerToken& ptoken() { return *reinterpret_cast(ptokenBuf_); } moodycamel::ConsumerToken& ctoken() { return *reinterpret_cast(ctokenBuf_); } moodycamel::ConcurrentQueue& cstore_; alignas(moodycamel::ProducerToken) char ptokenBuf_[sizeof(moodycamel::ProducerToken)]; alignas(moodycamel::ConsumerToken) char ctokenBuf_[sizeof(moodycamel::ConsumerToken)]; char** buffers_; size_t& count_; }; static void registerCleanup() { // Note that this would be better/cheaper as a static thread_local member; however, there are // currently bugs in multiple compilers that prevent the destructor from being called properly // in that context. A workaround that appears to work more portably is to put this here, and // ensure registerCleanup is called for any alloc or dealloc, even though this may have a small // runtime cost. (void)getThreadQueuingData(); } static size_t grabFromCentralStore(char** buffers) { auto& queue = getThreadQueuingData(); auto& globals = getSmallBufferGlobals(); auto& lock = globals.backingStoreLock; auto& backingStore = globals.backingStore; while (true) { size_t grabbed = queue.try_dequeue_bulk(buffers, kIdealNumTLBuffers); if (grabbed) { return grabbed; } uint32_t allocId = lock.fetch_add(1, std::memory_order_acquire); if (allocId == 0) { char* buffer = reinterpret_cast(detail::alignedMalloc(kMallocBytes, kChunkSize)); backingStore.push_back(buffer); constexpr size_t kNumToPush = kBuffersPerMalloc - kIdealNumTLBuffers; char* topush[kNumToPush]; for (size_t i = 0; i < kNumToPush; ++i, buffer += kChunkSize) { topush[i] = buffer; } queue.enqueue_bulk(topush, kNumToPush); lock.store(0, std::memory_order_release); for (size_t i = 0; i < kIdealNumTLBuffers; ++i, buffer += kChunkSize) { buffers[i] = buffer; } return kIdealNumTLBuffers; } else { while (lock.load(std::memory_order_relaxed)) { std::this_thread::yield(); } } } } static void recycleToCentralStore(char** buffers, size_t numToRecycle) { getThreadQueuingData().enqueue_bulk(buffers, numToRecycle); // TODO(bbudge): consider whether we need to do any garbage collection and return memory to // the system. 
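    // Note: dealloc() calls this with the upper half of the thread-local cache
    // (kIdealNumTLBuffers buffers) once the cache reaches kMaxNumTLBuffers, halving
    // the cache while leaving kIdealNumTLBuffers buffers for reuse on this thread.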
} private: DISPENSO_DLL_ACCESS static std::tuple buffersAndCount() { static DISPENSO_THREAD_LOCAL char* tlBuffers[kMaxNumTLBuffers]; static DISPENSO_THREAD_LOCAL size_t tlCount = 0; return {tlBuffers, tlCount}; }; DISPENSO_DLL_ACCESS static PerThreadQueuingData& getThreadQueuingData() { static thread_local PerThreadQueuingData data( getSmallBufferGlobals().centralStore, buffersAndCount()); return data; } }; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/detail/task_set_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file task_set.h * A file providing TaskSet and ConcurrentTaskSet. These interfaces allow the user to * submit/schedule multiple closures and then wait on them. **/ #pragma once #include namespace dispenso { class TaskSetBase; namespace detail { template class FutureBase; class LimitGatedScheduler; DISPENSO_DLL_ACCESS void pushThreadTaskSet(TaskSetBase* tasks); DISPENSO_DLL_ACCESS void popThreadTaskSet(); } // namespace detail DISPENSO_DLL_ACCESS TaskSetBase* parentTaskSet(); class TaskSetBase { public: TaskSetBase( ThreadPool& p, ParentCascadeCancel registerForParentCancel = ParentCascadeCancel::kOff, ssize_t stealingLoadMultiplier = 4) : pool_(p), taskSetLoadFactor_(stealingLoadMultiplier * p.numThreads()) { #if defined DISPENSO_DEBUG assert(stealingLoadMultiplier > 0); pool_.outstandingTaskSets_.fetch_add(1, std::memory_order_acquire); #endif parent_ = (registerForParentCancel == ParentCascadeCancel::kOn) ? parentTaskSet() : nullptr; if (parent_) { parent_->registerChild(this); if (parent_->canceled()) { canceled_.store(true, std::memory_order_release); } } } TaskSetBase(TaskSetBase&& other) = delete; TaskSetBase& operator=(TaskSetBase&& other) = delete; ssize_t numPoolThreads() const { return pool_.numThreads(); } ThreadPool& pool() { return pool_; } void cancel() { canceled_.store(true, std::memory_order_release); cancelChildren(); } bool canceled() const { return canceled_.load(std::memory_order_acquire); } /** * Check whether an exception has been captured by this task set. * When exceptions are disabled at compile time, this always returns false, * allowing the compiler to eliminate exception-related branches entirely. **/ #if defined(__cpp_exceptions) bool hasException() const { return guardException_.load(std::memory_order_acquire) != kUnset; } #else constexpr bool hasException() const { return false; } #endif ~TaskSetBase() { #if defined DISPENSO_DEBUG pool_.outstandingTaskSets_.fetch_sub(1, std::memory_order_release); #endif if (parent_) { parent_->unregisterChild(this); } } protected: template auto packageTask(F&& f) { outstandingTaskCount_.fetch_add(1, std::memory_order_acquire); return [this, f = std::move(f)]() mutable { // Skip push/pop if this TaskSet is already the current parent on this // thread. This happens with ConcurrentTaskSet self-recursion (scheduling // tasks from within tasks on the same set), which is normal and expected. // Only actual nesting of *different* TaskSets needs stack tracking. bool pushed = (parentTaskSet() != this); if (pushed) { detail::pushThreadTaskSet(this); } if (!canceled_.load(std::memory_order_acquire)) { #if defined(__cpp_exceptions) try { f(); } catch (...) 
{ trySetCurrentException(); } #else f(); #endif // __cpp_exceptions } if (pushed) { detail::popThreadTaskSet(); } outstandingTaskCount_.fetch_sub(1, std::memory_order_release); }; } // Package a task without incrementing the counter (for bulk scheduling where // the counter is incremented once for the entire batch). template auto packageTaskNoIncrement(F&& f) { return [this, f = std::move(f)]() mutable { detail::pushThreadTaskSet(this); if (!canceled_.load(std::memory_order_acquire)) { #if defined(__cpp_exceptions) try { f(); } catch (...) { trySetCurrentException(); } #else f(); #endif // __cpp_exceptions } detail::popThreadTaskSet(); outstandingTaskCount_.fetch_sub(1, std::memory_order_release); }; } template void scheduleBulkImpl(size_t count, Generator&& gen, moodycamel::ProducerToken* token) { if (count == 0) { return; } ssize_t numPool = pool_.numThreads(); size_t chunkSize = static_cast(numPool) + static_cast(numPool) / 2; if (chunkSize < 1) { chunkSize = 1; } size_t i = 0; while (i < count) { if (canceled()) { break; } ssize_t outstanding = outstandingTaskCount_.load(std::memory_order_relaxed); ssize_t curWork = pool_.workRemaining_.load(std::memory_order_relaxed); // Mirror the two-tier inline decision from ThreadPool::schedule(): // 1. TaskSet-level: outstandingTaskCount_ exceeds our own load factor // 2. Pool recursive: tight quickLoadFactor (numThreads*1.5) when called from a pool // thread. Critical for nested patterns where inner scheduleBulk must throttle to // avoid flooding the pool queue and starving outer tasks. // 3. Pool global: loose poolLoadFactor_ (numThreads*32) for non-recursive callers if (outstanding > taskSetLoadFactor_ || (detail::PerPoolPerThreadInfo::isPoolRecursive(&pool_) && curWork > numPool + numPool / 2) || curWork > pool_.poolLoadFactor_.load(std::memory_order_relaxed)) { #if defined(__cpp_exceptions) try { gen(i)(); } catch (...) { trySetCurrentException(); } #else gen(i)(); #endif // __cpp_exceptions ++i; } else { ssize_t room = taskSetLoadFactor_ - outstanding; size_t toEnqueue = std::min({count - i, chunkSize, static_cast(room)}); if (toEnqueue == 0) { toEnqueue = 1; } outstandingTaskCount_.fetch_add(static_cast(toEnqueue), std::memory_order_acquire); size_t base = i; pool_.scheduleBulkEnqueue( toEnqueue, [this, &gen, base](size_t j) { return packageTaskNoIncrement(gen(base + j)); }, token); i += toEnqueue; } } } DISPENSO_DLL_ACCESS void trySetCurrentException(); bool testAndResetException(); void registerChild(TaskSetBase* child) { std::lock_guard lk(mtx_); child->prev_ = tail_; child->next_ = nullptr; if (tail_) { tail_->next_ = child; tail_ = child; } else { head_ = tail_ = child; } } void unregisterChild(TaskSetBase* child) { std::lock_guard lk(mtx_); if (child->prev_) { child->prev_->next_ = child->next_; } else { // We're head assert(child == head_); head_ = child->next_; } if (child->next_) { child->next_->prev_ = child->prev_; } else { // We're tail assert(child == tail_); tail_ = child->prev_; } } void cancelChildren() { std::lock_guard lk(mtx_); auto* node = head_; while (node) { node->cancel(); node = node->next_; } } alignas(kCacheLineSize) std::atomic outstandingTaskCount_{0}; alignas(kCacheLineSize) ThreadPool& pool_; alignas(kCacheLineSize) std::atomic canceled_{false}; const ssize_t taskSetLoadFactor_; // Always present to ensure stable ABI layout regardless of __cpp_exceptions. 
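  // Sketch of the intended protocol (inferred from the state names; the out-of-line
  // trySetCurrentException owns the real logic): the first task to observe an exception
  // claims kSetting via compare-exchange, writes exception_, then publishes kSet, so
  // hasException() readers only ever observe a fully-written exception_.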
enum ExceptionState { kUnset, kSetting, kSet }; std::atomic guardException_{kUnset}; std::exception_ptr exception_; TaskSetBase* parent_; // This mutex guards modifications/use of the intusive linked list between head_ and tail_ std::mutex mtx_; TaskSetBase* head_{nullptr}; TaskSetBase* tail_{nullptr}; // prev_ and next_ are links in our *parent's* intrusive linked list. TaskSetBase* prev_; TaskSetBase* next_; }; } // namespace dispenso ================================================ FILE: dispenso/detail/timed_task_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include namespace dispenso { namespace detail { enum FunctionFlags : uint32_t { kFFlagsNone = 0, kFFlagsDetached = 1, kFFlagsCancelled = 2 }; struct TimedTaskImpl { alignas(kCacheLineSize) std::atomic count{0}; std::atomic timesToRun; std::atomic flags{kFFlagsNone}; std::atomic inProgress{0}; double nextAbsTime; double period; bool steady; std::function)> func; template TimedTaskImpl(size_t times, double next, double per, F&& f, Schedulable& sched, bool stdy) : timesToRun(times), nextAbsTime(next), period(per), steady(stdy) { func = [&sched, f = std::move(f), this](std::shared_ptr me) { if (flags.load(std::memory_order_acquire) & kFFlagsCancelled) { return; } inProgress.fetch_add(1, std::memory_order_acq_rel); auto wrap = [&f, this, me = std::move(me)]() mutable { if (!(flags.load(std::memory_order_acquire) & kFFlagsCancelled)) { if (!f()) { timesToRun.store(0, std::memory_order_release); flags.fetch_or(kFFlagsCancelled, std::memory_order_acq_rel); func = {}; } count.fetch_add(1, std::memory_order_acq_rel); } inProgress.fetch_sub(1, std::memory_order_release); me.reset(); }; sched.schedule(wrap, ForceQueuingTag()); }; } }; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/dispenso.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file dispenso.h * @brief Convenience header that includes all public dispenso headers. * * For finer-grained control over compile times, include individual headers instead. **/ #pragma once // Core threading primitives #include #include #include #include // Parallel algorithms #include #include #include // Graph-based task scheduling #include #include // Concurrent containers #include #include // Synchronization primitives #include #include #include // Memory management #include #include #include // Async utilities #include #include #include // Utilities #include #include #include #include #include // Sanitizer support #include ================================================ FILE: dispenso/fast_math/README.md ================================================ # dispenso::fast_math > **EXPERIMENTAL** — This sublibrary is under active development. The API is > unstable and subject to breaking changes without notice. Do not depend on it > in production code. > > fast_math is not built by default. To include it in the CMake build, set > `-DDISPENSO_BUILD_FAST_MATH=ON`. A fast, accurate, SIMD-friendly math library for `float` transcendentals. 
- [Design Goals](#design-goals)
- [Quick Start](#quick-start)
- [Accuracy Traits](#accuracy-traits)
- [SIMD Support](#simd-support)
- [Function Reference](#function-reference)
- [Performance](#performance)

## Design Goals

- **Fast**: 1.5-6x faster than glibc scalar math, with near-linear SIMD scaling
- **Accurate**: Drop-in replacements for `std::` math functions (1-4 ULP typical)
- **SIMD-native**: Same template works for `float`, `__m128`, `__m256`, `__m512`, `float32x4_t`, and Highway vectors — no source changes needed
- **Configurable**: `AccuracyTraits` template parameter trades speed for precision and edge-case handling at compile time

## Quick Start

```cpp
#include <dispenso/fast_math/fast_math.h>

namespace dfm = dispenso::fast_math;

// Scalar — drop-in replacement for std::sinf
float y = dfm::sin(x);

// SSE4.1 (4-wide) — same function, SIMD type
__m128 y4 = dfm::sin(x4);

// AVX2 (8-wide)
__m256 y8 = dfm::sin(x8);

// Max accuracy with bounds checking
float ymax = dfm::sin<dfm::MaxAccuracyTraits>(x);
```

For auto-detected best SIMD width:

```cpp
#include <dispenso/fast_math/simd.h>

using F = dispenso::fast_math::DefaultSimdFloat;
F result = dispenso::fast_math::sin(F(1.0f));
```

## Accuracy Traits

All functions accept an optional `AccuracyTraits` template parameter:

| Trait | `kMaxAccuracy` | `kBoundsValues` | Description |
|:------|:-:|:-:|:---|
| `DefaultAccuracyTraits` | false | false | Fastest. Correct over normal-range inputs. |
| `MaxAccuracyTraits` | true | true | Highest accuracy + edge-case handling (NaN, Inf, denorm). |

Custom traits can mix these independently:

```cpp
struct BoundsOnly {
  static constexpr bool kMaxAccuracy = false;
  static constexpr bool kBoundsValues = true; // Handle NaN/Inf, keep fast polynomials
};

float y = dfm::exp<BoundsOnly>(x);
```

## SIMD Support

### Supported Types

| Backend | Type | Width | Guard Macro |
|:--------|:-----|:-----:|:------------|
| Scalar | `float` | 1 | always |
| SSE4.1 | `__m128` | 4 | `__SSE4_1__` |
| AVX2 | `__m256` | 8 | `__AVX2__` |
| AVX-512 | `__m512` | 16 | `__AVX512F__` |
| NEON | `float32x4_t` | 4 | `__aarch64__` |
| Highway | `HwyFloat` | variable | `__has_include("hwy/highway.h")` |

All SIMD backends provide `FloatTraits` specializations with the required operations (`fma`, `sqrt`, `conditional`, `bit_cast`, etc.). Include `dispenso/fast_math/simd.h` and the appropriate backend headers are auto-included based on platform macros.

### How It Works

Every function is templated on `Flt`. When `Flt` is a SIMD vector type, all operations (arithmetic, comparisons, bit manipulation) dispatch through `FloatTraits` specializations that map to the corresponding intrinsics. Polynomial coefficients stay as scalar `float` arrays — the implicit `SseFloat(float)` / `_mm256_set1_ps` broadcast is efficient on modern hardware.
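As an illustrative sketch (not a library API), the same call site can serve both the vector body and the scalar tail of a loop. This assumes an AVX2 build and the `__m256` usage shown in Quick Start; the helper name `fast_log_inplace` is made up for the example:

```cpp
#include <immintrin.h>
#include <cstddef>

#include <dispenso/fast_math/fast_math.h>

namespace dfm = dispenso::fast_math;

// Illustrative helper (not part of dispenso): apply log() in place,
// 8 lanes at a time on AVX2, with a scalar loop for the remainder.
void fast_log_inplace(float* data, std::size_t n) {
  std::size_t i = 0;
#if defined(__AVX2__)
  for (; i + 8 <= n; i += 8) {
    __m256 v = _mm256_loadu_ps(data + i); // unaligned load of 8 floats
    __m256 r = dfm::log(v);               // same entry point as the scalar call
    _mm256_storeu_ps(data + i, r);
  }
#endif
  for (; i < n; ++i) {
    data[i] = dfm::log(data[i]); // scalar drop-in for std::log
  }
}
```
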
## Function Reference ### Trigonometric | Function | Signature | Domain | Max ULP (Default) | Max ULP (MaxAccuracy) | |:---------|:----------|:-------|:--:|:--:| | `sin` | `sin(x)` | all float | 1 ([-128pi,128pi]), 2 ([-1Mpi,1Mpi]) | 1 ([-1Mpi,1Mpi]), 2 (full) | | `cos` | `cos(x)` | all float | 1 ([-128pi,128pi]), 2 ([-1Mpi,1Mpi]) | 1 ([-128pi,128pi]), 2 ([-1Mpi,1Mpi]) | | `tan` | `tan(x)` | all float | 3 ([-128Kpi,128Kpi]), 32 ([-1Mpi,1Mpi]) | 3 ([-128Kpi,128Kpi]), 4 ([-1Mpi,1Mpi]) | ### Inverse Trigonometric | Function | Signature | Domain | Max ULP | |:---------|:----------|:-------|:--:| | `acos` | `acos(x)` | [-1, 1] | 3 | | `asin` | `asin(x)` | [-1, 1] | 2 | | `atan` | `atan(x)` | all float | 3 | | `atan2` | `atan2(y, x)` | all float | 3 (Default), 3 (MaxAccuracy, +Inf handling) | ### Exponential | Function | Signature | Domain | Max ULP (Default) | Max ULP (MaxAccuracy) | |:---------|:----------|:-------|:--:|:--:| | `exp` | `exp(x)` | [-89, 89] | 3 | 1 | | `exp2` | `exp2(x)` | [-127, 128] | 1 | 1 | | `exp10` | `exp10(x)` | [-40, 40] | 2 | 2 | | `expm1` | `expm1(x)` | all float | 2 | 1 | With `kBoundsValues = true`, exp/exp2/exp10 correctly handle NaN, +/-Inf, and out-of-range inputs (returning 0 for large negative, Inf for large positive). `expm1` computes exp(x) - 1 with precision near zero using Cody-Waite range reduction and Sollya-optimized polynomials. For x < -17, returns -1 exactly (exp(x) is below float ULP of 1). ### Logarithmic | Function | Signature | Domain | Max ULP (Default) | Max ULP (MaxAccuracy) | |:---------|:----------|:-------|:--:|:--:| | `log` | `log(x)` | (0, +Inf) | 2 | 2 | | `log1p` | `log1p(x)` | (-1, +Inf) | 2 | 2 | | `log2` | `log2(x)` | (0, +Inf) | 1 | 1 | | `log10` | `log10(x)` | (0, +Inf) | 3 | 3 | With `kBoundsValues = true`, log/log2/log10 correctly handle 0, denormals, Inf, and NaN inputs. `log1p` computes log(1 + x) with precision near zero using compensated addition: captures the rounding error of 1 + x and applies a first-order correction. Bounds handling inherited from log. ### Other | Function | Signature | Domain | Max ULP (Default) | Max ULP (MaxAccuracy) | |:---------|:----------|:-------|:--:|:--:| | `sqrt` | `sqrt(x)` | all float | 0 | 0 | | `cbrt` | `cbrt(x)` | all float | 12 | 3 | | `hypot` | `hypot(x, y)` | all float | 1 | 1 | | `pow` | `pow(x, y)` | all float | 1 | 1 | | `tanh` | `tanh(x)` | all float | 2 | 2 | | `erf` | `erf(x)` | all float | 2 | 2 | | `frexp` | `frexp(x, &e)` | all float | 0 | 0 | | `ldexp` | `ldexp(x, e)` | all float | 0 | 0 | `sqrt` delegates to hardware `sqrt`. `frexp` and `ldexp` are bit-accurate. `hypot` uses double-precision cast for scalar and dynamic exponent scaling with Newton refinement for SIMD (~1 ULP). Overflow-safe for all normal floats (up to ~2^126). With `kBoundsValues = true`, hypot correctly handles IEEE 754 boundary conditions: `hypot(±inf, y) = +inf` even when `y` is NaN, and `hypot(NaN, finite) = NaN`. `pow` uses double-precision arithmetic internally for ~1 ULP across all SIMD backends. Scalar uses a table-based double core; SIMD uses width-dependent dispatch (table-assisted for ≤4 lanes, fully tableless for ≥8 lanes). With `kBoundsValues = true`, handles all IEEE 754 special cases (negative base, zero, inf, NaN, subnormals). `tanh` uses `expm1(2x) / (expm1(2x) + 2)` with a Sollya-optimized polynomial fast-path for scalar |x| < 1. Saturates to ±1 for |x| > 10. `erf` uses an Abramowitz & Stegun 7.1.26-inspired t-substitution with Sollya-optimized coefficients (p=0.45). 
Two domains: pure polynomial for |x| < 0.875, erfc formula with inline exp(-x²) for |x| ∈ [0.875, 3.92]. NaN propagates naturally via `clamp_allow_nan` (no `kBoundsValues` overhead). Default and MaxAccuracy are identical. ## Performance Speedup ratios relative to platform libc (scalar) or scalar fast_math (SIMD). Absolute throughput varies by clock speed and microarchitecture; ratios are more meaningful for comparison. ### Linux x86-64 — AMD EPYC Genoa 166 cores, clang 21.1.7, `-O2 -march=native`, CMake Release build. #### Scalar: fast_math vs glibc | Function | Speedup | Function | Speedup | |:---------|:-:|:---------|:-:| | sin | 1.6x | exp | 3.8x | | cos | 1.6x | exp2 | 2.7x | | tan | 2.9x | exp10 | 2.5x | | acos | 2.2x | log | 3.4x | | asin | 3.4x | log2 | 3.0x | | atan | 1.7x | log10 | 3.3x | | atan2 | 3.3x | cbrt | 5.6x | | frexp | 3.8x | ldexp | 5.8x | | hypot | 2.3x | pow | 1.2x | | expm1 | 2.4x | log1p | 1.4x | | tanh | 3.0x | | | | erf | 1.6x | | | #### SIMD Scaling (per-element throughput relative to scalar fast_math) | Function | SSE (4) | AVX (8) | AVX-512 (16) | Highway (16) | |:---------|:-:|:-:|:-:|:-:| | sin | 2.7x | 5.4x | 6.5x | 5.8x | | cos | 2.6x | 5.2x | 6.5x | 5.7x | | tan | 3.6x | 7.0x | 8.7x | 7.7x | | acos | 4.9x | 9.8x | 10.1x | 10.2x | | asin | 1.9x | 3.8x | 4.3x | 4.3x | | atan | 4.8x | 9.5x | 11.9x | 11.9x | | atan2 | 3.5x | 7.0x | 9.6x | 9.6x | | exp | 3.7x | 7.4x | 7.7x | 7.3x | | exp2 | 3.7x | 7.5x | 8.8x | 7.2x | | exp10 | 3.7x | 7.3x | 8.6x | 7.1x | | log | 3.7x | 7.4x | 8.8x | 8.8x | | log2 | 3.6x | 7.3x | 9.6x | 9.6x | | log10 | 3.7x | 7.4x | 9.2x | 9.2x | | cbrt | 2.9x | 5.7x | 7.0x | 7.1x | | frexp | 7.4x | 14.8x | 23.0x | 22.9x | | ldexp | 4.8x | 9.7x | 16.6x | 16.6x | | hypot | 3.6x | — | 9.9x | — | | pow | — | 3.6x | 7.2x | — | | expm1 | 2.7x | 5.4x | 10.4x | 9.4x | | log1p | 3.0x | 6.0x | 12.5x | 12.4x | | tanh | 3.0x | 6.1x | 12.2x | 10.1x | | erf | 2.4x | 5.1x | 5.5x | 4.9x | Highway dispatches to AVX-512 (16-lane) on this hardware. #### Highway vs hwy::contrib (AVX-512 dispatch) | Function | Ratio | Function | Ratio | |:---------|:-:|:---------|:-:| | sin | 0.76x | log | 1.60x | | cos | 0.74x | log2 | 1.62x | | exp | 1.63x | log10 | 1.67x | | exp2 | 2.21x | atan | 1.12x | | acos | 1.63x | asin | 0.87x | | atan2 | 1.67x | | | fast_math is 1.6-2.2x faster for exp/log family, comparable for inverse trig, and ~25% slower for sin/cos (contrib uses a different polynomial fit). #### MaxAccuracyTraits overhead (SSE4.1) | Function | Overhead | Function | Overhead | |:---------|:-:|:---------|:-:| | sin | ~0% | log | 46% | | cos | ~0% | cbrt | 24% | | exp | 37% | hypot | 18% (SSE), 29% (AVX-512) | ### Mac ARM — Apple M4 Pro 12 cores, 48 GB, Apple clang, `-O2`. #### Scalar: fast_math vs Apple libm | Function | Speedup | Function | Speedup | |:---------|:-:|:---------|:-:| | sin | 1.0x | exp | 1.7x | | cos | 1.4x | exp2 | 1.7x | | tan | 1.2x | exp10 | 1.6x | | acos | 1.6x | log | 1.6x | | asin | 1.8x | log2 | 1.0x | | atan | 1.8x | log10 | 1.6x | | atan2 | 1.7x | cbrt | 2.6x | | frexp | 1.4x | ldexp | 2.0x | | hypot | 1.6x | | | Apple's libm is highly optimized for M-series silicon; the smaller speedups reflect a stronger baseline rather than slower fast_math. 
#### SIMD Scaling (per-element throughput relative to scalar fast_math) | Function | NEON (4) | Highway (4) | |:---------|:-:|:-:| | sin | 5.4x | 4.9x | | cos | 3.7x | 3.5x | | tan | 4.6x | 4.1x | | acos | 4.8x | 3.5x | | asin | 2.8x | 2.9x | | atan | 3.4x | 2.3x | | atan2 | 2.2x | 1.3x | | exp | 3.9x | 3.6x | | exp2 | 4.0x | 3.9x | | exp10 | 3.9x | 2.7x | | log | 4.4x | 2.7x | | log2 | 3.5x | 1.1x | | log10 | 2.5x | 1.6x | | cbrt | 3.1x | 2.4x | | frexp | 4.0x | 4.3x | | ldexp | 4.0x | 4.0x | | hypot | 2.1x | — | Highway dispatches to NEON (4-lane) on AArch64. NEON backend is generally faster than Highway on ARM due to lower abstraction overhead. ================================================ FILE: dispenso/fast_math/detail/double_promote.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ // DoubleVec: a pair of double-precision vectors covering all lanes of Flt. // // Provides the minimal set of operations needed for the tableless // double-precision pow core: widen, narrow, arithmetic, FMA, and // exp2 range-reduction helpers. // // Specialized for each SIMD backend alongside its float type: // float → double (scalar) // SseFloat → __m128d lo, hi // AvxFloat → __m256d lo, hi // Avx512Float → __m512d lo, hi // NeonFloat → float64x2_t lo, hi // HwyFloat → Vec> lo, hi #pragma once #include #include #include #include namespace dispenso { namespace fast_math { namespace detail { // Primary template — undefined; each backend specializes. template struct DoubleVec; // --------------------------------------------------------------------------- // Scalar: DoubleVec is just a double. // --------------------------------------------------------------------------- template <> struct DoubleVec { double v; DoubleVec() = default; explicit DoubleVec(double d) : v(d) {} static DISPENSO_INLINE DoubleVec from_float(float f) { return DoubleVec{static_cast(f)}; } // Gather: load table[index] as a double. static DISPENSO_INLINE DoubleVec gather(const double* base, int32_t index) { return DoubleVec{base[index]}; } DISPENSO_INLINE float to_float() const { return static_cast(v); } friend DISPENSO_INLINE DoubleVec operator+(DoubleVec a, DoubleVec b) { return DoubleVec{a.v + b.v}; } friend DISPENSO_INLINE DoubleVec operator-(DoubleVec a, DoubleVec b) { return DoubleVec{a.v - b.v}; } friend DISPENSO_INLINE DoubleVec operator*(DoubleVec a, DoubleVec b) { return DoubleVec{a.v * b.v}; } friend DISPENSO_INLINE DoubleVec fma(DoubleVec a, DoubleVec b, DoubleVec c) { return DoubleVec{std::fma(a.v, b.v, c.v)}; } friend DISPENSO_INLINE DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) { return DoubleVec{x.v < low.v ? low.v : (x.v > high.v ? high.v : x.v)}; } }; // exp2 range reduction: free function, defined after DoubleVec is complete. // Splits x into integer n and fractional r ∈ [-0.5, 0.5]. // Returns {r, scale} where scale = 2^n. // Boundary cases (from ylogx clamped to [-1022, 1023]): // n = 1024 → scale = +inf (0x7FF0...), result overflows to +inf — correct. // n = -1023 → scale = 0.0, result underflows to 0 — correct. 
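// Worked example: x = 3.3 → shifted = kShift + 3 after round-to-nearest, so n = 3,
// nd = 3.0, r = 0.3, and scale = bit_cast<double>(uint64_t(3 + 1023) << 52) = 2^3 = 8.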
DISPENSO_INLINE std::pair, DoubleVec> exp2_split(DoubleVec x) { using DV = DoubleVec; constexpr double kShift = 0x1.8p+52; constexpr uint64_t kShiftBits = 0x4338000000000000ULL; double shifted = x.v + kShift; uint64_t ki = bit_cast(shifted); double nd = shifted - kShift; double r = x.v - nd; int64_t n = static_cast(ki - kShiftBits); double scale = bit_cast(static_cast(n + 1023) << 52); return {DV{r}, DV{scale}}; } // --------------------------------------------------------------------------- // SSE: DoubleVec holds two __m128d (4 float lanes → 2×2 doubles). // --------------------------------------------------------------------------- #if defined(__SSE4_1__) template <> struct DoubleVec { __m128d lo, hi; // lo = lanes 0,1; hi = lanes 2,3 DoubleVec() = default; DoubleVec(__m128d l, __m128d h) : lo(l), hi(h) {} explicit DoubleVec(double d) : lo(_mm_set1_pd(d)), hi(_mm_set1_pd(d)) {} static DISPENSO_INLINE DoubleVec from_float(SseFloat f) { return {_mm_cvtps_pd(f.v), _mm_cvtps_pd(_mm_movehl_ps(f.v, f.v))}; } // Gather: load base[idx[lane]] for each of 4 lanes via scalar lookups. // Used for small tables that fit in L1 (e.g. 16-entry pow log2 table). static DISPENSO_INLINE DoubleVec gather(const double* base, SseInt32 idx) { int32_t i0 = _mm_extract_epi32(idx.v, 0); int32_t i1 = _mm_extract_epi32(idx.v, 1); int32_t i2 = _mm_extract_epi32(idx.v, 2); int32_t i3 = _mm_extract_epi32(idx.v, 3); return {_mm_set_pd(base[i1], base[i0]), _mm_set_pd(base[i3], base[i2])}; } DISPENSO_INLINE SseFloat to_float() const { __m128 lo_f = _mm_cvtpd_ps(lo); __m128 hi_f = _mm_cvtpd_ps(hi); return SseFloat{_mm_movelh_ps(lo_f, hi_f)}; } friend DISPENSO_INLINE DoubleVec operator+(DoubleVec a, DoubleVec b) { return {_mm_add_pd(a.lo, b.lo), _mm_add_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator-(DoubleVec a, DoubleVec b) { return {_mm_sub_pd(a.lo, b.lo), _mm_sub_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator*(DoubleVec a, DoubleVec b) { return {_mm_mul_pd(a.lo, b.lo), _mm_mul_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec fma(DoubleVec a, DoubleVec b, DoubleVec c) { #if defined(__FMA__) return {_mm_fmadd_pd(a.lo, b.lo, c.lo), _mm_fmadd_pd(a.hi, b.hi, c.hi)}; #else return {_mm_add_pd(_mm_mul_pd(a.lo, b.lo), c.lo), _mm_add_pd(_mm_mul_pd(a.hi, b.hi), c.hi)}; #endif } friend DISPENSO_INLINE DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) { return { _mm_min_pd(_mm_max_pd(x.lo, low.lo), high.lo), _mm_min_pd(_mm_max_pd(x.hi, low.hi), high.hi)}; } }; DISPENSO_INLINE std::pair, DoubleVec> exp2_split( DoubleVec x) { using DV = DoubleVec; __m128d kShift = _mm_set1_pd(0x1.8p+52); __m128i kShiftBits = _mm_set1_epi64x(0x4338000000000000LL); __m128i k1023 = _mm_set1_epi64x(1023); __m128d shifted_lo = _mm_add_pd(x.lo, kShift); __m128d shifted_hi = _mm_add_pd(x.hi, kShift); __m128i ki_lo = _mm_castpd_si128(shifted_lo); __m128i ki_hi = _mm_castpd_si128(shifted_hi); __m128d nd_lo = _mm_sub_pd(shifted_lo, kShift); __m128d nd_hi = _mm_sub_pd(shifted_hi, kShift); __m128d r_lo = _mm_sub_pd(x.lo, nd_lo); __m128d r_hi = _mm_sub_pd(x.hi, nd_hi); __m128i n_lo = _mm_sub_epi64(ki_lo, kShiftBits); __m128i n_hi = _mm_sub_epi64(ki_hi, kShiftBits); __m128d scale_lo = _mm_castsi128_pd(_mm_slli_epi64(_mm_add_epi64(n_lo, k1023), 52)); __m128d scale_hi = _mm_castsi128_pd(_mm_slli_epi64(_mm_add_epi64(n_hi, k1023), 52)); return {DV{r_lo, r_hi}, DV{scale_lo, scale_hi}}; } #endif // __SSE4_1__ // --------------------------------------------------------------------------- // AVX: DoubleVec holds two __m256d (8 float 
lanes → 2×4 doubles). // --------------------------------------------------------------------------- #if defined(__AVX2__) template <> struct DoubleVec { __m256d lo, hi; // lo = lanes 0-3; hi = lanes 4-7 DoubleVec() = default; DoubleVec(__m256d l, __m256d h) : lo(l), hi(h) {} explicit DoubleVec(double d) : lo(_mm256_set1_pd(d)), hi(_mm256_set1_pd(d)) {} static DISPENSO_INLINE DoubleVec from_float(AvxFloat f) { return { _mm256_cvtps_pd(_mm256_castps256_ps128(f.v)), _mm256_cvtps_pd(_mm256_extractf128_ps(f.v, 1))}; } // Gather: AVX2 has native 4-wide double gather from int32 indices. // Two gathers cover all 8 lanes. The table is 256 bytes (4 cachelines), // likely L1-hot after the first call. static DISPENSO_INLINE DoubleVec gather(const double* base, AvxInt32 idx) { __m128i lo4 = _mm256_castsi256_si128(idx.v); __m128i hi4 = _mm256_extracti128_si256(idx.v, 1); return {_mm256_i32gather_pd(base, lo4, 8), _mm256_i32gather_pd(base, hi4, 8)}; } DISPENSO_INLINE AvxFloat to_float() const { __m128 lo_f = _mm256_cvtpd_ps(lo); __m128 hi_f = _mm256_cvtpd_ps(hi); return AvxFloat{_mm256_set_m128(hi_f, lo_f)}; } friend DISPENSO_INLINE DoubleVec operator+(DoubleVec a, DoubleVec b) { return {_mm256_add_pd(a.lo, b.lo), _mm256_add_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator-(DoubleVec a, DoubleVec b) { return {_mm256_sub_pd(a.lo, b.lo), _mm256_sub_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator*(DoubleVec a, DoubleVec b) { return {_mm256_mul_pd(a.lo, b.lo), _mm256_mul_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec fma(DoubleVec a, DoubleVec b, DoubleVec c) { #if defined(__FMA__) return {_mm256_fmadd_pd(a.lo, b.lo, c.lo), _mm256_fmadd_pd(a.hi, b.hi, c.hi)}; #else return { _mm256_add_pd(_mm256_mul_pd(a.lo, b.lo), c.lo), _mm256_add_pd(_mm256_mul_pd(a.hi, b.hi), c.hi)}; #endif } friend DISPENSO_INLINE DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) { return { _mm256_min_pd(_mm256_max_pd(x.lo, low.lo), high.lo), _mm256_min_pd(_mm256_max_pd(x.hi, low.hi), high.hi)}; } }; DISPENSO_INLINE std::pair, DoubleVec> exp2_split( DoubleVec x) { using DV = DoubleVec; __m256d kShift = _mm256_set1_pd(0x1.8p+52); __m256i kShiftBits = _mm256_set1_epi64x(0x4338000000000000LL); __m256i k1023 = _mm256_set1_epi64x(1023); __m256d shifted_lo = _mm256_add_pd(x.lo, kShift); __m256d shifted_hi = _mm256_add_pd(x.hi, kShift); __m256i ki_lo = _mm256_castpd_si256(shifted_lo); __m256i ki_hi = _mm256_castpd_si256(shifted_hi); __m256d nd_lo = _mm256_sub_pd(shifted_lo, kShift); __m256d nd_hi = _mm256_sub_pd(shifted_hi, kShift); __m256d r_lo = _mm256_sub_pd(x.lo, nd_lo); __m256d r_hi = _mm256_sub_pd(x.hi, nd_hi); __m256i n_lo = _mm256_sub_epi64(ki_lo, kShiftBits); __m256i n_hi = _mm256_sub_epi64(ki_hi, kShiftBits); __m256d scale_lo = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_add_epi64(n_lo, k1023), 52)); __m256d scale_hi = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_add_epi64(n_hi, k1023), 52)); return {DV{r_lo, r_hi}, DV{scale_lo, scale_hi}}; } #endif // __AVX2__ // --------------------------------------------------------------------------- // AVX-512: DoubleVec holds two __m512d (16 floats → 2×8 doubles). 
// --------------------------------------------------------------------------- #if defined(__AVX512F__) template <> struct DoubleVec { __m512d lo, hi; // lo = lanes 0-7; hi = lanes 8-15 DoubleVec() = default; DoubleVec(__m512d l, __m512d h) : lo(l), hi(h) {} explicit DoubleVec(double d) : lo(_mm512_set1_pd(d)), hi(_mm512_set1_pd(d)) {} static DISPENSO_INLINE DoubleVec from_float(Avx512Float f) { __m256 lo8 = _mm512_castps512_ps256(f.v); // Extract upper 8 floats without requiring AVX-512 DQ: // cast to __m512i, extract upper 256-bit int lane, reinterpret as __m256. __m256 hi8 = _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(f.v), 1)); return {_mm512_cvtps_pd(lo8), _mm512_cvtps_pd(hi8)}; } // Gather: AVX-512 has native 8-wide double gather from int32 indices. // Two gathers cover all 16 lanes. static DISPENSO_INLINE DoubleVec gather(const double* base, Avx512Int32 idx) { __m256i lo8 = _mm512_castsi512_si256(idx.v); __m256i hi8 = _mm512_extracti64x4_epi64(idx.v, 1); return {_mm512_i32gather_pd(lo8, base, 8), _mm512_i32gather_pd(hi8, base, 8)}; } DISPENSO_INLINE Avx512Float to_float() const { __m256 lo_f = _mm512_cvtpd_ps(lo); __m256 hi_f = _mm512_cvtpd_ps(hi); // Combine without AVX-512 DQ: cast to int, insert, cast back. __m512i combined = _mm512_inserti64x4( _mm512_castsi256_si512(_mm256_castps_si256(lo_f)), _mm256_castps_si256(hi_f), 1); return Avx512Float{_mm512_castsi512_ps(combined)}; } friend DISPENSO_INLINE DoubleVec operator+(DoubleVec a, DoubleVec b) { return {_mm512_add_pd(a.lo, b.lo), _mm512_add_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator-(DoubleVec a, DoubleVec b) { return {_mm512_sub_pd(a.lo, b.lo), _mm512_sub_pd(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator*(DoubleVec a, DoubleVec b) { return {_mm512_mul_pd(a.lo, b.lo), _mm512_mul_pd(a.hi, b.hi)}; } // AVX-512 always has FMA. friend DISPENSO_INLINE DoubleVec fma(DoubleVec a, DoubleVec b, DoubleVec c) { return {_mm512_fmadd_pd(a.lo, b.lo, c.lo), _mm512_fmadd_pd(a.hi, b.hi, c.hi)}; } friend DISPENSO_INLINE DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) { return { _mm512_min_pd(_mm512_max_pd(x.lo, low.lo), high.lo), _mm512_min_pd(_mm512_max_pd(x.hi, low.hi), high.hi)}; } }; DISPENSO_INLINE std::pair, DoubleVec> exp2_split( DoubleVec x) { using DV = DoubleVec; __m512d kShift = _mm512_set1_pd(0x1.8p+52); __m512i kShiftBits = _mm512_set1_epi64(0x4338000000000000LL); __m512i k1023 = _mm512_set1_epi64(1023); __m512d shifted_lo = _mm512_add_pd(x.lo, kShift); __m512d shifted_hi = _mm512_add_pd(x.hi, kShift); __m512i ki_lo = _mm512_castpd_si512(shifted_lo); __m512i ki_hi = _mm512_castpd_si512(shifted_hi); __m512d nd_lo = _mm512_sub_pd(shifted_lo, kShift); __m512d nd_hi = _mm512_sub_pd(shifted_hi, kShift); __m512d r_lo = _mm512_sub_pd(x.lo, nd_lo); __m512d r_hi = _mm512_sub_pd(x.hi, nd_hi); __m512i n_lo = _mm512_sub_epi64(ki_lo, kShiftBits); __m512i n_hi = _mm512_sub_epi64(ki_hi, kShiftBits); __m512d scale_lo = _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_add_epi64(n_lo, k1023), 52)); __m512d scale_hi = _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_add_epi64(n_hi, k1023), 52)); return {DV{r_lo, r_hi}, DV{scale_lo, scale_hi}}; } #endif // __AVX512F__ // --------------------------------------------------------------------------- // NEON: DoubleVec holds two float64x2_t (4 floats → 2×2 doubles). 
// --------------------------------------------------------------------------- #if defined(__aarch64__) template <> struct DoubleVec { float64x2_t lo, hi; // lo = lanes 0,1; hi = lanes 2,3 DoubleVec() = default; DoubleVec(float64x2_t l, float64x2_t h) : lo(l), hi(h) {} explicit DoubleVec(double d) : lo(vdupq_n_f64(d)), hi(vdupq_n_f64(d)) {} static DISPENSO_INLINE DoubleVec from_float(NeonFloat f) { return {vcvt_f64_f32(vget_low_f32(f.v)), vcvt_f64_f32(vget_high_f32(f.v))}; } // Gather: load base[idx[lane]] for each of 4 lanes via scalar lookups. static DISPENSO_INLINE DoubleVec gather(const double* base, NeonInt32 idx) { int32_t i0 = vgetq_lane_s32(idx.v, 0); int32_t i1 = vgetq_lane_s32(idx.v, 1); int32_t i2 = vgetq_lane_s32(idx.v, 2); int32_t i3 = vgetq_lane_s32(idx.v, 3); float64x2_t lo_d = vsetq_lane_f64(base[i1], vdupq_n_f64(base[i0]), 1); float64x2_t hi_d = vsetq_lane_f64(base[i3], vdupq_n_f64(base[i2]), 1); return {lo_d, hi_d}; } DISPENSO_INLINE NeonFloat to_float() const { float32x2_t lo_f = vcvt_f32_f64(lo); float32x2_t hi_f = vcvt_f32_f64(hi); return NeonFloat{vcombine_f32(lo_f, hi_f)}; } friend DISPENSO_INLINE DoubleVec operator+(DoubleVec a, DoubleVec b) { return {vaddq_f64(a.lo, b.lo), vaddq_f64(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator-(DoubleVec a, DoubleVec b) { return {vsubq_f64(a.lo, b.lo), vsubq_f64(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator*(DoubleVec a, DoubleVec b) { return {vmulq_f64(a.lo, b.lo), vmulq_f64(a.hi, b.hi)}; } // AArch64 always has FMA. friend DISPENSO_INLINE DoubleVec fma(DoubleVec a, DoubleVec b, DoubleVec c) { return {vfmaq_f64(c.lo, a.lo, b.lo), vfmaq_f64(c.hi, a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) { return { vminnmq_f64(vmaxnmq_f64(x.lo, low.lo), high.lo), vminnmq_f64(vmaxnmq_f64(x.hi, low.hi), high.hi)}; } }; DISPENSO_INLINE std::pair, DoubleVec> exp2_split( DoubleVec x) { using DV = DoubleVec; float64x2_t kShift = vdupq_n_f64(0x1.8p+52); float64x2_t shifted_lo = vaddq_f64(x.lo, kShift); float64x2_t shifted_hi = vaddq_f64(x.hi, kShift); uint64x2_t ki_lo = vreinterpretq_u64_f64(shifted_lo); uint64x2_t ki_hi = vreinterpretq_u64_f64(shifted_hi); float64x2_t nd_lo = vsubq_f64(shifted_lo, kShift); float64x2_t nd_hi = vsubq_f64(shifted_hi, kShift); float64x2_t r_lo = vsubq_f64(x.lo, nd_lo); float64x2_t r_hi = vsubq_f64(x.hi, nd_hi); uint64x2_t kShiftBits = vdupq_n_u64(0x4338000000000000ULL); uint64x2_t k1023 = vdupq_n_u64(1023); uint64x2_t n_lo = vsubq_u64(ki_lo, kShiftBits); uint64x2_t n_hi = vsubq_u64(ki_hi, kShiftBits); float64x2_t scale_lo = vreinterpretq_f64_u64(vshlq_n_u64(vaddq_u64(n_lo, k1023), 52)); float64x2_t scale_hi = vreinterpretq_f64_u64(vshlq_n_u64(vaddq_u64(n_hi, k1023), 52)); return {DV{r_lo, r_hi}, DV{scale_lo, scale_hi}}; } #endif // __aarch64__ // --------------------------------------------------------------------------- // Highway: DoubleVec holds two Vec>. // Width adapts to the compile-time target (SSE→2×2, AVX→2×4, AVX-512→2×8). 
// --------------------------------------------------------------------------- #if __has_include("hwy/highway.h") #include template <> struct DoubleVec { using DTag = hn::Repartition; using DV = hn::Vec; using ITag = hn::RebindToSigned; DV lo, hi; // lo = lower half lanes, hi = upper half lanes DoubleVec() = default; DoubleVec(DV l, DV h) : lo(l), hi(h) {} explicit DoubleVec(double d) : lo(hn::Set(DTag{}, d)), hi(hn::Set(DTag{}, d)) {} static DISPENSO_INLINE DoubleVec from_float(HwyFloat f) { const DTag dd; return {hn::PromoteLowerTo(dd, f.v), hn::PromoteUpperTo(dd, f.v)}; } // Gather: load base[idx[lane]] for each lane via scalar lookups. // Highway lacks a double-precision gather from int32 indices, so we // extract indices and load scalars. The table is small and L1-hot. static DISPENSO_INLINE DoubleVec gather(const double* base, HwyInt32 idx) { const DTag dd; constexpr size_t kMaxF = HWY_MAX_BYTES / sizeof(float); constexpr size_t kMaxD = kMaxF / 2; // half as many double lanes HWY_ALIGN int32_t ibuf[kMaxF]; hn::StoreU(idx.v, HwyInt32Tag{}, ibuf); HWY_ALIGN double lo_buf[kMaxD]; HWY_ALIGN double hi_buf[kMaxD]; const size_t nd = hn::Lanes(dd); for (size_t j = 0; j < nd; ++j) { lo_buf[j] = base[ibuf[j]]; hi_buf[j] = base[ibuf[nd + j]]; } return {hn::Load(dd, lo_buf), hn::Load(dd, hi_buf)}; } DISPENSO_INLINE HwyFloat to_float() const { const HwyFloatTag df; // DemoteTo narrows each double half to a half-width float vector. auto lo_f = hn::DemoteTo(hn::Rebind{}, lo); auto hi_f = hn::DemoteTo(hn::Rebind{}, hi); // Combine: lo_f and hi_f are each half-width Vec; concatenate. return HwyFloat{hn::Combine(df, hi_f, lo_f)}; } friend DISPENSO_INLINE DoubleVec operator+(DoubleVec a, DoubleVec b) { return {hn::Add(a.lo, b.lo), hn::Add(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator-(DoubleVec a, DoubleVec b) { return {hn::Sub(a.lo, b.lo), hn::Sub(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec operator*(DoubleVec a, DoubleVec b) { return {hn::Mul(a.lo, b.lo), hn::Mul(a.hi, b.hi)}; } friend DISPENSO_INLINE DoubleVec fma(DoubleVec a, DoubleVec b, DoubleVec c) { return {hn::MulAdd(a.lo, b.lo, c.lo), hn::MulAdd(a.hi, b.hi, c.hi)}; } friend DISPENSO_INLINE DoubleVec clamp(DoubleVec x, DoubleVec low, DoubleVec high) { return {hn::Min(hn::Max(x.lo, low.lo), high.lo), hn::Min(hn::Max(x.hi, low.hi), high.hi)}; } }; DISPENSO_INLINE std::pair, DoubleVec> exp2_split( DoubleVec x) { using DVH = DoubleVec; using DTag = DVH::DTag; using ITag = DVH::ITag; auto kShift = hn::Set(DTag{}, 0x1.8p+52); auto kShiftBits = hn::Set(ITag{}, 0x4338000000000000LL); auto k1023 = hn::Set(ITag{}, int64_t{1023}); auto shifted_lo = hn::Add(x.lo, kShift); auto shifted_hi = hn::Add(x.hi, kShift); auto ki_lo = hn::BitCast(ITag{}, shifted_lo); auto ki_hi = hn::BitCast(ITag{}, shifted_hi); auto nd_lo = hn::Sub(shifted_lo, kShift); auto nd_hi = hn::Sub(shifted_hi, kShift); auto r_lo = hn::Sub(x.lo, nd_lo); auto r_hi = hn::Sub(x.hi, nd_hi); auto n_lo = hn::Sub(ki_lo, kShiftBits); auto n_hi = hn::Sub(ki_hi, kShiftBits); auto scale_lo = hn::BitCast(DTag{}, hn::ShiftLeft<52>(hn::Add(n_lo, k1023))); auto scale_hi = hn::BitCast(DTag{}, hn::ShiftLeft<52>(hn::Add(n_hi, k1023))); return {DVH{r_lo, r_hi}, DVH{scale_lo, scale_hi}}; } #endif // __has_include("hwy/highway.h") } // namespace detail } // namespace fast_math } // namespace dispenso ================================================ FILE: dispenso/fast_math/detail/fast_math_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include "../util.h" namespace dispenso { namespace fast_math { namespace detail { // Constants and algorithms come from "Fast Calculation of Cube and Inverse Cube Roots Using a Magic // Constant and Its Implementation on Microcontrollers" by Moroz et al. template DISPENSO_INLINE Flt rcp_cbrt(Flt x, IntType_t i) { auto fma = FloatTraits::fma; if constexpr (AccuracyTraits::kMaxAccuracy) { // Algorithm 3 step 1 + FMA quadratic correction (2 ULP, division-free Halley). // // Step 1 uses Algorithm 3's magic constant and tuned Newton step (~10 bits). // Step 2 computes the FMA residual e = 1 - x*y^3 with single rounding (no // cancellation), then applies a Sollya-optimized quadratic correction: // y *= 1 + e*(c1 + c2*e) // This achieves cubic convergence (bits triple) without Halley's division, // at the same op count as the previous Algorithm 5 but with better accuracy. // // Polynomial: fpminimax for (1-e)^(-1/3), constant=1, on [-0.005, 0.005]. // Relative error ~1.08e-8 (~26.5 bits). constexpr float c1 = 0.3333354890346527099609375f; constexpr float c2 = 0.22222582995891571044921875f; i = 0x548c39cb - int_div_by_3(i); Flt y = bit_cast(i); y = y * fma(-0.534850249f * x * y, y * y, 1.5015480449f); Flt e = fma(x * y, -y * y, 1.0f); y = y * fma(e, fma(Flt(c2), e, Flt(c1)), 1.0f); return y; } else { // Algorithm 3 from Moroz et al. (12 ULP). i = 0x548c39cb - int_div_by_3(i); Flt y = bit_cast(i); y = y * fma(-0.534850249f * x * y, y * y, 1.5015480449f); y = y * fma(-0.3333333333f * x * y, y * y, 1.333333985f); return y; } } // TODO: We likely will never need a double precision version of this function, but note that this // is definitely busted for double. template DISPENSO_INLINE Flt frexpImpl(IntType_t hx, IntType_t* eptr) { using IntT = IntType_t; IntT ix = 0x7fffffff & hx; auto nonzero = hx != 0; *eptr = FloatTraits::conditional(nonzero, (ix >> 23) - 126, IntT(0)); return bit_cast((hx & 0x807fffff) | 0x3f000000); } template DISPENSO_INLINE Flt ldexpImpl(UintType_t hx, IntType_t e) { auto esgn = signofi(e); e *= esgn; hx += esgn * (e << 23); return bit_cast(hx); } template DISPENSO_INLINE Flt cos_pi_4(Flt x2) { // Approximate cosine on [-PI/4,+PI/4], max error 0.87444 ULP. // Polynomial in x²: c0 + c1*x² + c2*x⁴ + c3*x⁶ + c4*x⁸. return dispenso::fast_math::hornerEval( x2, 2.44677067e-5f, -1.38877297e-3f, 4.16666567e-2f, -5.00000000e-1f, 1.00000000e+0f); } template DISPENSO_INLINE Flt sin_pi_4(Flt x2, Flt x) { // Approximate sine on [-PI/4,+PI/4]. // Sollya fpminimax(sin(x) - x, [|3,5,7,9|], [|SG...|], [-pi/4;pi/4], absolute). // Odd polynomial: sin(x) = x + x³ * P(x²). Flt s = dispenso::fast_math::hornerEval( x2, 2.78547373e-6f, -1.98483889e-4f, 8.33336823e-3f, -1.66666672e-1f); return FloatTraits::fma(s, x * x2, x); } // Wider-domain sin/cos polynomials for pi-reduction (no quadrant blending needed). // sin(x) on [-pi/2, pi/2], degree 11 odd polynomial: sin(x) = x * P(x^2) // Sollya fpminimax(sin(x), [|1,3,5,7,9,11|], [|SG...|], [-pi/2;pi/2], absolute) // Absolute error: 3.2e-10 (31.5 bits) template DISPENSO_INLINE Flt sin_pi_2(Flt x2, Flt x) { // Odd polynomial: sin(x) = x + x³ * P(x²). 
Flt s = dispenso::fast_math::hornerEval( x2, -0x1.ab55a8p-26f, 0x1.725326p-19f, -0x1.a0205p-13f, 0x1.11112ep-7f, -0x1.555556p-3f); return FloatTraits::fma(s, x * x2, x); } template DISPENSO_INLINE Flt rangeReduceFloor(Flt& x, NonDeduced rcpMod, NonDeduced valHi, NonDeduced valLo) { auto j = floor_small(x * rcpMod); // FloatTraits::fma(x, rcpMod, kMagic) - kMagic; x = FloatTraits::fma(j, -valHi, x); x = FloatTraits::fma(j, -valLo, x); return j; } template DISPENSO_INLINE Flt rangeReduce(Flt& x, NonDeduced rcpMod, NonDeduced valHi, NonDeduced valLo) { const Flt kMagic = FloatTraits::kMagic; auto j = FloatTraits::fma(x, rcpMod, kMagic) - kMagic; x = FloatTraits::fma(j, -valHi, x); x = FloatTraits::fma(j, -valLo, x); return j; } template DISPENSO_INLINE Flt rangeReduce2( Flt& x, NonDeduced rcpMod, NonDeduced valHi, NonDeduced valMed, NonDeduced valLo) { const Flt kMagic = FloatTraits::kMagic; auto j = FloatTraits::fma(x, rcpMod, kMagic) - kMagic; x = FloatTraits::fma(j, -valHi, x); x = FloatTraits::fma(j, -valMed, x); x = FloatTraits::fma(j, -valLo, x); return j; } template DISPENSO_INLINE Flt rangeReduce3( Flt& x, NonDeduced rcpMod, NonDeduced valHi, NonDeduced valMedHi, NonDeduced valMedLo, NonDeduced valLo) { const Flt kMagic = FloatTraits::kMagic; auto j = FloatTraits::fma(x, rcpMod, kMagic) - kMagic; x = FloatTraits::fma(j, -valHi, x); x = FloatTraits::fma(j, -valMedHi, x); x = FloatTraits::fma(j, -valMedLo, x); x = FloatTraits::fma(j, -valLo, x); return j; } // https://stackoverflow.com/questions/63918873/approximating-cosine-on-0-pi-using-only-single-precision-floating-point // Cosine has quadrant bias 1, sine has quadrant bias 0 template DISPENSO_INLINE Flt sincos_pi_impl(Flt x, Flt j, int phaseShift) { /* phase shift of pi/2 (one quadrant) for cosine */ auto i = convert_to_int(j) + phaseShift; Flt r; if constexpr (std::is_same_v) { // For scalars, explicit if/else is faster to ensure // we avoid computing both functions if (i & 1) { r = cos_pi_4(x * x); } else { r = sin_pi_4(x * x, x); } } else { r = FloatTraits::conditional((i & 1) != 0, cos_pi_4(x * x), sin_pi_4(x * x, x)); } return FloatTraits::conditional((i & 2) != 0, -r, r); } // Combined sin+cos computation. Always evaluates both polynomials (needed for sincos). // For SIMD types, evaluates both polynomials and blends per-lane. template DISPENSO_INLINE void sincos_pi_impl_both(Flt x, Flt j, Flt* out_sin, Flt* out_cos) { auto i = convert_to_int(j); auto x2 = x * x; auto sv = sin_pi_4(x2, x); auto cv = cos_pi_4(x2); // For sin: quadrant i. For cos: quadrant i+1 (phase shift). auto sin_use_cos = (i & 1) != 0; auto cos_use_cos = ((i + 1) & 1) != 0; Flt sr = FloatTraits::conditional(sin_use_cos, cv, sv); Flt cr = FloatTraits::conditional(cos_use_cos, cv, sv); *out_sin = FloatTraits::conditional((i & 2) != 0, -sr, sr); *out_cos = FloatTraits::conditional(((i + 1) & 2) != 0, -cr, cr); } // Pi-reduction dispatch: each function evaluates only its own polynomial. // j = round(x / pi), so x_reduced in [-pi/2, pi/2]. // sin(x) = (-1)^j * sin(x_reduced), cos(x) = (-1)^j * cos(x_reduced). template DISPENSO_INLINE Flt sin_pi_reduction(Flt x, Flt j) { auto ji = convert_to_int(j); auto x2 = x * x; auto result = sin_pi_2(x2, x); return FloatTraits::conditional((ji & 1) != 0, -result, result); } // Cosine via offset-pi reduction (Highway-style). // q = 2*trunc(|x|/pi) + 1 maps cos zeros to x_r = 0. Then cos(x) = ±sin(x_r). // No quadrant blending: one polynomial, sign flip only. 
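// Worked example: x = 7.0 → q = 2*trunc(7.0/pi) + 1 = 5 and x_r = 7.0 - 5*pi/2 ≈ -0.854;
// q & 2 == 0, so the result is -sin(x_r) ≈ 0.754, matching cos(7.0).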
template DISPENSO_INLINE Flt cos_offset_pi_reduction(Flt x_r, IntType_t qi) { auto x2 = x_r * x_r; auto result = sin_pi_2(x2, x_r); // (q & 2) == 0 → negate (cos starts positive at q=1). return FloatTraits::conditional((qi & 2) != 0, result, -result); } template DISPENSO_INLINE Flt tan_pi_2_impl(Flt x, Flt j) { auto i = convert_to_int(j); auto x2 = x * x; auto cv = cos_pi_4(x2); auto sv = sin_pi_4(x2, x); auto flip = (i & 1) != 0; Flt r; if constexpr (std::is_same_v) { if (flip) { r = cv / sv; } else { r = sv / cv; } } else { auto n = FloatTraits::conditional(flip, cv, sv); auto d = FloatTraits::conditional(flip, sv, cv); r = n / d; } auto neg = ((i ^ (i + 1)) & 2) != 0; r = FloatTraits::conditional(neg, -r, r); return r; } template DISPENSO_INLINE Flt asin_0_pt5(Flt x) { // asin(x) = x + x * x² * P(x²), where P approximates (asin(x)-x)/x³ on [0, 0.5]. auto x2 = x * x; Flt px; if constexpr (AccuracyTraits::kMaxAccuracy || AccuracyTraits::kBoundsValues) { // Sollya fpminimax((asin(x)-x)/x^3, [|0,2,4,6,8,10|], [|SG...|], [1b-50, 0.5]): // sup-norm error < 2^-27. px = dispenso::fast_math::hornerEval( x2, 0x1.1a258cp-5f, 0x1.10abf6p-6f, 0x1.ff9494p-6f, 0x1.6d4032p-5f, 0x1.3334c8p-4f, 0x1.555554p-3f) * x2; } else { // Sollya fpminimax((asin(x)-x)/x^3, [|0,2,4,6,8|], [|SG...|], [1b-50, 0.5]): // sup-norm error < 2^-24. px = dispenso::fast_math::hornerEval( x2, 0x1.3926d2p-5f, 0x1.b1e3acp-6f, 0x1.70bf2ap-5f, 0x1.332688p-4f, 0x1.55555ep-3f) * x2; } return x + x * px; } template DISPENSO_INLINE Flt asin_pt5_1(Flt x) { // asin(x) = pi/2 + sqrt(1-x) * P(x), where P approximates (asin(x)-pi/2)/sqrt(1-x) // on [0.5, 1]. constexpr float kPi_2hi = 1.57079637050628662109375f; constexpr float kPi_2lo = -4.37113900018624283e-8f; Flt y; if constexpr (AccuracyTraits::kMaxAccuracy || AccuracyTraits::kBoundsValues) { // Sollya fpminimax((asin(x)-pi/2)/sqrt(1-x), 6, [|SG...|], [0.5, 1-1b-23]): // sup-norm error < 2^-32. y = dispenso::fast_math::hornerEval( x, -0x1.b7070cp-11f, 0x1.72345p-8f, -0x1.2f2922p-6f, 0x1.59358ep-5f, -0x1.5f9f5p-4f, 0x1.b6199ap-3f, -0x1.921b8p0f); } else { // Sollya fpminimax((asin(x)-pi/2)/sqrt(1-x), 5, [|SG...|], [0.5, 1-1b-23]): // sup-norm error < 2^-28. y = dispenso::fast_math::hornerEval( x, 0x1.edfaacp-10f, -0x1.79d524p-7f, 0x1.216da6p-5f, -0x1.507bb6p-4f, 0x1.b3ef5cp-3f, -0x1.921358p0f); } // Use compensating sum for better rounding. return kPi_2hi + FloatTraits::fma(FloatTraits::sqrt(1.0f - x), y, kPi_2lo); } template DISPENSO_INLINE std::tuple, Flt, Flt> logarithmSep(Flt& x) { using IntT = IntType_t; using UintT = UintType_t; IntT xi; Flt i; Flt m; auto fma = FloatTraits::fma; if constexpr (AccuracyTraits::kMaxAccuracy) { auto small = x < 1.175494351e-38f; x = FloatTraits::conditional(small, x * 8388608.0f, x); i = FloatTraits::conditional(small, Flt(-23.0f), Flt(0.0f)); xi = bit_cast(x); // Use unsigned arithmetic to avoid signed overflow UB on negative inputs. 
UintT xu = bit_cast(x); UintT e = (xu - UintT(0x3f3504f3)) & UintT(0xff800000); m = bit_cast(xu - e); i = fma(Flt(bit_cast(e)), 1.19209290e-7f, i); } else { // Ignore denorms, degradation behavior not that bad xi = bit_cast(x); UintT xu = bit_cast(x); UintT e = (xu - UintT(0x3f3504f3)) & UintT(0xff800000); m = bit_cast(xu - e); i = Flt(bit_cast(e)) * 1.19209290e-7f; } return {xi, i, m}; } template DISPENSO_INLINE Flt logarithmBounds(Flt x, Flt y, IntType_t xi) { using IntT = IntType_t; using UintT = UintType_t; // Negative-input detection uses unsigned comparison: negative float bit // patterns have the sign bit set, making them large unsigned values. Scalar // C++ promotes to unsigned implicitly, but SIMD integer comparison intrinsics // are always signed, so we use UintT explicitly for the SIMD path. // // Condition 3 (negative detection) uses > 0x80000000u (not >= ) so that -0.0f // (bit pattern 0x80000000) is excluded — IEEE 754 requires log(-0) = -inf, // which is already handled by condition 1 (x == 0.0f). if constexpr (std::is_same_v) { if (xi < 0x00800000 || xi >= 0x7f800000) { int orbits = bool_apply_or_zero(x == 0.0f, 0xff800000); orbits |= bool_apply_or_zero((xi & 0x7f800000) == 0x7f800000, xi); orbits |= bool_apply_or_zero(static_cast(xi) > 0x80000000u, 0x7f8fffff); y = orbits ? bit_cast(orbits) : y; } } else { IntT orbits = bool_apply_or_zero(x == 0.0f, 0xff800000); orbits |= bool_apply_or_zero((xi & 0x7f800000) == 0x7f800000, xi); orbits |= bool_apply_or_zero(bit_cast(xi) > UintT(0x80000000), 0x7f8fffff); // Use a proper mask (all-ones/all-zeros) for conditional, since blendv // only checks bit 31. Raw orbits values like 0x7F800000 (+inf) have bit 31 // clear and would be incorrectly treated as "false". auto hasBound = orbits != IntT(0); y = FloatTraits::conditional(hasBound, bit_cast(orbits), y); } return y; } template DISPENSO_INLINE Flt atan_poly(Flt x) { // atan(x) ≈ x * P(x), where P approximates atan(x)/x on [0, 1]. // Horner (not Estrin): Estrin regresses atan ULP from 3 to 4 and is slower // on scalar due to register pressure from the x-power tree. Flt y; if constexpr (AccuracyTraits::kMaxAccuracy || AccuracyTraits::kBoundsValues) { // Sollya fpminimax(atan(x)/x, 11, [|SG...|], [1b-50, 1]): // sup-norm error < 2^-31. y = dispenso::fast_math::hornerEval( x, -0x1.1592cap-6f, 0x1.a09c1cp-4f, -0x1.f68112p-3f, 0x1.150f26p-2f, -0x1.e2e56p-5f, -0x1.0e262ep-3f, -0x1.d8f3p-15f, 0x1.98c9aap-3f, 0x1.352e9p-14f, -0x1.5556b2p-2f, 0x1.f2bd6p-24f, 0x1p0f); } else { // Sollya fpminimax(atan(x)/x, 9, [|SG...|], [1b-50, 1]): // sup-norm error < 2^-26. y = dispenso::fast_math::hornerEval( x, 0x1.1f76cap-6f, -0x1.cc3324p-4f, 0x1.252662p-2f, -0x1.530ee2p-2f, 0x1.2ca298p-4f, 0x1.76789ap-3f, 0x1.29d66ep-9f, -0x1.557c3cp-2f, 0x1.c39d04p-19f, 0x1p0f); } return y * x; } } // namespace detail } // namespace fast_math } // namespace dispenso ================================================ FILE: dispenso/fast_math/fast_math.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * EXPERIMENTAL — This sublibrary is under active development. * The API is unstable and subject to breaking changes without notice. * Do not depend on it in production code. 
*/ #pragma once #include namespace dispenso { namespace fast_math { struct DefaultAccuracyTraits { // Check for out-of-range, NaN, Inf inputs and ensure outputs match std. Note that functions may // still provide correct values out of bounds, but if this is false, they are not required to. static constexpr bool kBoundsValues = false; // Try to get the lowest possible errors to match std. static constexpr bool kMaxAccuracy = false; }; struct MaxAccuracyTraits { // Check for out-of-range, NaN, Inf inputs and ensure outputs match std static constexpr bool kBoundsValues = true; // Try to get the lowest possible errors to match std. static constexpr bool kMaxAccuracy = true; }; } // namespace fast_math } // namespace dispenso #include "detail/double_promote.h" #include "detail/fast_math_impl.h" namespace dispenso { namespace fast_math { namespace detail { // AccuracyTraits adapter for internal pow use: always uses best polynomial, // inherits kBoundsValues from the outer traits. template struct PowInternalTraits { static constexpr bool kMaxAccuracy = true; static constexpr bool kBoundsValues = AccuracyTraits::kBoundsValues; }; // Scalar integer parity check on float bits. // Returns 0 if y is not an integer, 1 if even integer, 2 if odd integer. // Works regardless of sign bit (sign is masked off from exponent field). // Branching version — used inside the rare negative-x path. DISPENSO_INLINE int32_t checkint(uint32_t iy) { int32_t e = static_cast(iy >> 23) & 0xff; if (e < 127) return 0; // |y| < 1 → not integer (includes 0, subnormals) if (e > 127 + 23) return 1; // |y| >= 2^24 → ULP ≥ 2, only even integers representable // 127 <= e <= 150: check mantissa bits below the integer point. uint32_t shift = static_cast(127 + 23 - e); if (iy & ((1u << shift) - 1u)) return 0; // fractional mantissa bits → not integer if (iy & (1u << shift)) return 2; // lowest integer bit set → odd return 1; // even integer } // Branchless integer parity check for the scalar non-bounds path. // Returns 0 if not integer, 1 if even integer, 2 if odd integer. // For the non-bounds default path, we want to avoid branches on the hot path. // The negative-x case is rare, but mixed-sign workloads benefit from branchless. DISPENSO_INLINE int32_t checkint_branchless(uint32_t iy) { int32_t e = static_cast(iy >> 23) & 0xff; // Clamp shift to [0, 31] to avoid UB from shifting >= 32. When e < 127 // (|y| < 1, not integer), the clamped shift reads a garbage bit, but // is_int below is false so the garbage is never used. uint32_t shift = static_cast(127 + 23 - e) & 31u; uint32_t has_frac = iy & ((1u << shift) - 1u); uint32_t odd_bit = (iy >> shift) & 1u; // is_int: e >= 127 and no fractional bits (or e > 150, always integer). // For e > 150: shift would be negative (clamped to garbage), but we force result = 1. int32_t is_int = (e >= 127) & (has_frac == 0); int32_t is_large = (e > 127 + 23); // always even // Result: 0 if not int, 1 if even int, 2 if odd int. // odd_bit is meaningful only when is_int && !is_large. return is_int + (is_int & static_cast(odd_bit) & ~is_large); } // Scalar double-precision pow core: exp2(y * log2(|x|)) entirely in double. // Float inputs/output, double internal arithmetic. // // log2: 16-entry table of {1/c, log2(c)} eliminates division. Degree-4 // polynomial (Sollya fpminimax) approximates log2(1+r)/r with pipelined // evaluation. Table centers at OFF + i*2^19 + 2^18, max |r| < 0.0297. // exp2: 32-entry table of 2^(k/32) stored as uint64 with pre-subtracted // exponent bits. 
Degree-3 polynomial (Sollya fpminimax) on [-1/64, 1/64]. // // Total: ~8 double muls + ~5 double adds in two short parallel chains. // Achieves <1 ULP for float output. // log2 table: 16 entries, each {1/c, log2(c)} where c is the center of // the ith subinterval of [OFF, 2*OFF] (OFF = 0x3f330000). struct PowLog2Entry { double invc; double logc; }; // EXP2F_TABLE_BITS = 5 → 32 entries. // tab[i] = uint64(2^(i/32)) - (i << (52-5)). // To reconstruct 2^(k/32): double(tab[k%32] + (k << 47)). constexpr uint64_t kPowExp2Tab[32] = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, }; constexpr PowLog2Entry kPowLog2Tab[16] = { {0x1.661ec6a5122f9p+0, -0x1.efec61b011f85p-2}, // i=0, c=0.71484375 {0x1.571ed3c506b3ap+0, -0x1.b0b67f4f46810p-2}, // i=1, c=0.74609375 {0x1.49539e3b2d067p+0, -0x1.7418acebbf18fp-2}, // i=2, c=0.77734375 {0x1.3c995a47babe7p+0, -0x1.39de8e1559f6fp-2}, // i=3, c=0.80859375 {0x1.30d190130d190p+0, -0x1.01d9bbcfa61d4p-2}, // i=4, c=0.83984375 {0x1.25e22708092f1p+0, -0x1.97c1cb13c7ec1p-3}, // i=5, c=0.87109375 {0x1.1bb4a4046ed29p+0, -0x1.2f9e32d5bfdd1p-3}, // i=6, c=0.90234375 {0x1.12358e75d3033p+0, -0x1.960caf9abb7cap-4}, // i=7, c=0.93359375 {0x1.0953f39010954p+0, -0x1.a6f9c377dd31bp-5}, // i=8, c=0.96484375 {0x1p+0, 0x0p+0}, // i=9, c=1.0 (exact) {0x1.e573ac901e574p-1, 0x1.3aa2fdd27f1c3p-4}, // i=10, c=1.05468750 {0x1.ca4b3055ee191p-1, 0x1.476a9f983f74dp-3}, // i=11, c=1.11718750 {0x1.b2036406c80d9p-1, 0x1.e840be74e6a4dp-3}, // i=12, c=1.17968750 {0x1.9c2d14ee4a102p-1, 0x1.406463b1b0449p-2}, // i=13, c=1.24218750 {0x1.886e5f0abb04ap-1, 0x1.88e9c72e0b226p-2}, // i=14, c=1.30468750 {0x1.767dce434a9b1p-1, 0x1.ce0a4923a587dp-2}, // i=15, c=1.36718750 }; // log2(1+r)/r polynomial coefficients (degree 4, Sollya fpminimax). // Relative error < 2^-32 on |r| < 0.0297. constexpr double kPowLog2Poly[5] = { 0x1.27bc533354f16p-2, // P[0]: r^4 coeff -0x1.719a02b5d4c08p-2, // P[1]: r^3 coeff 0x1.ec70982733e8fp-2, // P[2]: r^2 coeff -0x1.7154745c0af07p-1, // P[3]: r coeff (approx -1/(2*ln2)) 0x1.71547652bc3ep0, // P[4]: r^0 coeff (approx 1/ln2) }; // exp2 polynomial coefficients (degree 3 on [-1/64, 1/64], Sollya fpminimax). // Relative error < 2^-27 on (2^r - 1)/r. 
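// Illustrative sketch (not part of the library): reconstructing the exp2
// scale factor from kPowExp2Tab above. Each entry stores uint64(2^(i/32))
// with (i << 47) already subtracted, so adding the rounded index k back and
// bit-casting yields 2^(k/32), with the integer part of k/32 landing in the
// exponent field:
//
//   uint64_t t = kPowExp2Tab[k & 31] + (uint64_t(k) << (52 - 5));
//   double   s = bit_cast<double>(t);   // s == 2^(k/32)
//
// e.g. k = 32: tab[0] = 0x3ff0000000000000 (1.0), plus (32 << 47) = 2^52,
// gives 0x4000000000000000, i.e. s = 2.0 = 2^(32/32).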
constexpr double kPowExp2Poly[3] = { 0x1.c6b08d7047f43p-5, // C[0]: r^2 coeff 0x1.ebfccc582d012p-3, // C[1]: r coeff 0x1.62e42fefe5286p-1, // C[2]: r^0 coeff (approx ln2) }; DISPENSO_INLINE float pow_double_core(float ax, float y) { constexpr uint32_t kOff = 0x3f330000; // --- log2 --- uint32_t ix = bit_cast(ax); uint32_t tmp = ix - kOff; int32_t i = (tmp >> (23 - 4)) & 15; // table index uint32_t top = tmp & 0xff800000u; uint32_t iz = ix - top; int32_t k = static_cast(top) >> 23; // exponent (arithmetic shift) double invc = kPowLog2Tab[i].invc; double logc = kPowLog2Tab[i].logc; double z = static_cast(bit_cast(iz)); // log2(x) = log1p(z/c - 1)/ln2 + log2(c) + k double r = z * invc - 1.0; double y0 = logc + static_cast(k); // Pipelined polynomial: two parallel chains merged via r4. double r2 = r * r; double p0 = kPowLog2Poly[0] * r + kPowLog2Poly[1]; double p1 = kPowLog2Poly[2] * r + kPowLog2Poly[3]; double r4 = r2 * r2; double p2 = kPowLog2Poly[4] * r + y0; p2 = p1 * r2 + p2; double logx = p0 * r4 + p2; // --- y * log2(x) --- double ylogx = static_cast(y) * logx; // Overflow/underflow check. // Check if |ylogx| >= 126 using exponent bits. uint64_t ylogx_bits = bit_cast(ylogx); if ((ylogx_bits >> 47 & 0xffff) >= (bit_cast(126.0) >> 47)) { if (ylogx > 0x1.fffffffd1d571p+6) { return std::numeric_limits::infinity(); } if (ylogx <= -150.0) { return 0.0f; } } // --- exp2 --- // x = k/N + r with r in [-1/(2N), 1/(2N)], N=32. constexpr double kShift = 0x1.8p+52 / 32.0; // magic for round-to-nearest double kd = ylogx + kShift; uint64_t ki = bit_cast(kd); kd -= kShift; // k/N, rounded double rd = ylogx - kd; // exp2(x) = 2^(k/N) * (C0*r^3 + C1*r^2 + C2*r + 1) // = s * poly(r) uint64_t t = kPowExp2Tab[ki & 31]; t += ki << (52 - 5); // add exponent double s = bit_cast(t); double r2e = rd * rd; double pe = kPowExp2Poly[0] * rd + kPowExp2Poly[1]; double qe = kPowExp2Poly[2] * rd + 1.0; qe = pe * r2e + qe; return static_cast(qe * s); } // Hybrid double-precision pow core: table-assisted log2 + tableless exp2. // Float inputs/output, double internal arithmetic via DoubleVec. // // log2: SIMD range reduction (float/uint32) extracts a 4-bit table index per // lane. Scalar lookups fetch {1/c, log2(c)} from the 16-entry table, packed // into DoubleVec. The table narrows |r| to <0.03, enabling a degree-4 // polynomial (vs degree-10 tableless). 8 scalar loads total (4 lanes × 2 // values), always L1-hot (256 bytes). // exp2: tableless degree-5 polynomial via exp2_split (same as poly core). // // Works for both scalar (Flt=float) and SIMD (Flt=SseFloat, etc.). template DISPENSO_INLINE Flt pow_double_hybrid_core(Flt ax, Flt y) { using DV = DoubleVec; using IntT = IntType_t; using UintT = UintType_t; // --- Range reduction (SIMD, same as pow_double_core) --- // Note: does NOT handle subnormal ax (ix < 0x00800000). Callers must // catch subnormals in is_special and fall back to pow_double_poly_core. constexpr uint32_t kOff = 0x3f330000; UintT ix = bit_cast(ax); UintT tmp = ix - UintT(kOff); IntT i = (bit_cast(tmp) >> 19) & IntT(15); // 4-bit table index UintT top = tmp & UintT(0xff800000u); UintT iz = ix - top; IntT k = bit_cast(top) >> 23; // exponent (arithmetic shift) // --- Table lookups: 4 lanes × {invc, logc} = 8 scalar loads --- // Table is 16 × {double, double} = 256 bytes, stride 2 doubles per entry. 
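// Illustrative worked example (not part of the library) of the range
// reduction shared with pow_double_core above, showing how the gathered
// table entry is selected. For x = 3.0f (ix = 0x40400000):
//
//   tmp = ix - kOff          = 0x010D0000
//   i   = (tmp >> 19) & 15   = 1            → c = 0.74609375
//   top = tmp & 0xff800000   = 0x01000000   → k = top >> 23 = 2
//   iz  = ix - top           = 0x3F400000   → z = 0.75
//
// so r = z/c - 1 ≈ 0.00524 and
// log2(3) = k + log2(c) + log2(1 + r) ≈ 2 - 0.4226 + 0.0075 ≈ 1.585;
// every lane with x = 3.0f gathers invc/logc from kPowLog2Tab[1].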
const double* tab = reinterpret_cast(kPowLog2Tab); IntT i2 = i + i; // stride: 2 doubles per entry DV invc = DV::gather(tab, i2); // tab[2*i + 0] = invc DV logc = DV::gather(tab, i2 + IntT(1)); // tab[2*i + 1] = logc // --- log2 in double --- DV z = DV::from_float(bit_cast(iz)); DV r = z * invc - DV(1.0); DV y0 = logc + DV::from_float(Flt(k)); // Degree-4 Horner for log2 (chain too short for Estrin benefit). // log2(x) = y0 + r * P(r), |r| < 0.03. DV p(kPowLog2Poly[0]); p = fma(p, r, DV(kPowLog2Poly[1])); p = fma(p, r, DV(kPowLog2Poly[2])); p = fma(p, r, DV(kPowLog2Poly[3])); p = fma(p, r, DV(kPowLog2Poly[4])); DV logx = fma(p, r, y0); // --- y * log2(x) --- DV ylogx = DV::from_float(y) * logx; // Clamp to valid double exponent range. ylogx = clamp(ylogx, DV(-1022.0), DV(1023.0)); // --- Tableless exp2 in double --- auto [re, scale] = exp2_split(ylogx); // Degree-5 Horner for exp2 (chain too short for Estrin benefit). DV q(0x1.4308fabc18decp-13); q = fma(q, re, DV(0x1.5f07b4611e454p-10)); q = fma(q, re, DV(0x1.3b2b9fbd8970fp-7)); q = fma(q, re, DV(0x1.c6af6e92be375p-5)); q = fma(q, re, DV(0x1.ebfbdec29c82ap-3)); q = fma(q, re, DV(0x1.62e4302eeb44dp-1)); DV result = fma(scale, re * q, scale); return result.to_float(); } // Tableless double-precision pow core: exp2(y * log2(|x|)) entirely in double. // Float inputs/output, double internal arithmetic via DoubleVec. // // log2: Uses logarithmSep range reduction (float/uint32) to get exponent i and // mantissa m in [sqrt(0.5), sqrt(2)]. Then t = (double)m - 1.0 (exact in // double) and log2(1+t) is evaluated as t * P(t) where P is a degree-10 // Sollya fpminimax polynomial (~30 bits). // No tables, no division, no error tracking. // exp2: Tableless double-precision polynomial. Range reduce to integer n and // fractional r ∈ [-0.5, 0.5], then 2^x = 2^n * (1 + r*Q(r)) where Q is a // degree-5 Sollya fpminimax polynomial (~27 bits). No tables. // // Works for both scalar (Flt=float) and SIMD (Flt=SseFloat, etc.). template DISPENSO_INLINE Flt pow_double_poly_core(Flt ax, Flt y) { using DV = DoubleVec; // --- Range reduction (float/uint32) --- // logarithmSep gives us exponent i and mantissa m in [sqrt(0.5), sqrt(2)]. auto [xi, i_f, m] = logarithmSep(ax); // --- Widen to double BEFORE subtraction --- // m - 1 in float loses ~17 bits when m ≈ 1 (catastrophic cancellation). // In double, (double)m - 1.0 is exact since every float is exactly // representable in double, and the result fits in 24 mantissa bits. DV t = DV::from_float(m) - DV(1.0); DV i_d = DV::from_float(i_f); // --- Double polynomial: log2(1+t) = t * P(t), degree 10 (Estrin) --- // Sollya fpminimax on [-0.293, 0.414], double precision. // Sup-norm error: 1.10e-9 (~30 bits). // Estrin's scheme evaluates independent sub-polynomials in parallel, // reducing latency vs Horner by exploiting ILP (multiple independent FMAs // at each level feed both FMA ports simultaneously). // P(t) = (c0+c1*t) + t²*(c2+c3*t) + t⁴*(c4+c5*t) + t⁶*(c6+c7*t) // + t⁸*(c8+c9*t+c10*t²) // // Level 0: t², and five independent coefficient pairs. DV t2 = t * t; DV p01 = fma(DV(-0x1.7154762acdfffp-1), t, DV(0x1.71547656a2495p+0)); DV p23 = fma(DV(-0x1.71548ef022a84p-2), t, DV(0x1.ec707f060e35ep-2)); DV p45 = fma(DV(-0x1.ec7a94d1207adp-3), t, DV(0x1.278083c622fbcp-2)); DV p67 = fma(DV(-0x1.6e5f6f8760a8ap-3), t, DV(0x1.a3ee210f50f0fp-3)); DV p89 = fma(DV(-0x1.5a6abc244ec79p-3), t, DV(0x1.60552656955f2p-3)); // Level 1: t⁴, combine pairs with t². 
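// Illustrative latency sketch for the Estrin levels used here (not part of
// the library). A degree-10 Horner chain is ~10 dependent FMAs; the level
// structure keeps only one FMA per level on the critical path:
//
//   level 0: t2 and the five coefficient pairs (independent FMAs)
//   level 1: t4, q03, q47, q8A   (each depends only on level 0)
//   level 2: t8, r07             (each depends only on levels 0-1)
//   level 3: p = fma(q8A, t8, r07)
//
// so the dependent chain is ~5 FMAs, roughly halving latency when the
// hardware can issue the independent FMAs in parallel.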
DV t4 = t2 * t2; DV q03 = fma(p23, t2, p01); DV q47 = fma(p67, t2, p45); DV q8A = fma(DV(0x1.8e0e8c535162ap-4), t2, p89); // c10*t² + (c8+c9*t) // Level 2: t⁸, combine quads with t⁴. DV t8 = t4 * t4; DV r07 = fma(q47, t4, q03); // Level 3: final combination. DV p = fma(q8A, t8, r07); // log2(1+t) = t * P(t); log2(x) = i + log2(m) DV log2x = i_d + t * p; // --- y * log2(x) --- DV ylogx = DV::from_float(y) * log2x; // Clamp to valid double exponent range [-1022, 1023]. // This prevents invalid bit patterns in exp2_split while preserving // correct float narrowing: 2^(-1022) → 0.0f, 2^1023 → inf. ylogx = clamp(ylogx, DV(-1022.0), DV(1023.0)); // --- Tableless exp2 in double --- auto [r, scale] = exp2_split(ylogx); // 2^r = 1 + r * Q(r), degree-5 Sollya fpminimax on [-0.5, 0.5]. // Q(r) ≈ (2^r - 1)/r. Sup-norm error: 8.88e-9 (~27 bits). DV q(0x1.4308fabc18decp-13); // e5 q = fma(q, r, DV(0x1.5f07b4611e454p-10)); // e4 q = fma(q, r, DV(0x1.3b2b9fbd8970fp-7)); // e3 q = fma(q, r, DV(0x1.c6af6e92be375p-5)); // e2 q = fma(q, r, DV(0x1.ebfbdec29c82ap-3)); // e1 q = fma(q, r, DV(0x1.62e4302eeb44dp-1)); // e0 // 2^ylogx = scale * (1 + r * q) = scale + scale * r * q DV result = fma(scale, r * q, scale); return result.to_float(); } // --- Scalar pow IEEE 754 special-case helpers --- // Handle y = 0, inf, or NaN for scalar pow bounds path. // Returns true if the result is determined (stored in *out). // // Bit tricks used throughout: // 2u * val — left-shifts by 1, discarding the sign bit. This maps // ±0 to 0, ±inf to 0xFF000000, and NaN to > 0xFF000000. // 2u * val - 1u — maps ±0 to UINT_MAX (wraps), normal floats to a // positive range, and inf/NaN to >= 0xFF000000 - 1. // So "2u*val - 1u >= 2u*0x7f800000u - 1u" tests whether // val is zero, inf, or NaN in one branch-free comparison. // ix & 0x7FFFFFFFu — clears the sign bit, giving |x| as raw bits. DISPENSO_INLINE bool pow_scalar_y_special(float x, float y, uint32_t ix, uint32_t iy, float* out) { if (2u * iy == 0u) { *out = 1.0f; return true; // pow(x, ±0) = 1 } if (ix == 0x3f800000u) { *out = 1.0f; return true; // pow(1, y) = 1 even for NaN y } // Either x or y is NaN → propagate via x + y (IEEE 754 NaN arithmetic). if (2u * (ix & 0x7fffffffu) > 2u * 0x7f800000u || 2u * iy > 2u * 0x7f800000u) { *out = x + y; return true; } // |x| == 1 (either +1 or -1) and y is ±inf → result is 1. if (2u * (ix & 0x7fffffffu) == 2u * 0x3f800000u) { *out = 1.0f; return true; } // y is ±inf (NaN cases handled above). Result is 0 or +inf depending on // whether |x| < 1 and the sign of y: // |x| < 1, y = +inf → 0 |x| > 1, y = +inf → +inf // |x| < 1, y = -inf → +inf |x| > 1, y = -inf → 0 if ((2u * (ix & 0x7fffffffu) < 2u * 0x3f800000u) == !(iy & 0x80000000u)) { *out = 0.0f; return true; } *out = y * y; // +inf (y*y overflows to +inf for finite y, identity for ±inf) return true; } // Handle negative/zero/inf/subnormal x for scalar pow bounds path. // Called when x is outside the normal positive range (the fast-path condition // in the caller already filtered normal positive x). Returns true if the // result is determined (stored in *out). Otherwise adjusts ix (to |x|, with // subnormals normalized) and sign_bias for the core computation. DISPENSO_INLINE bool pow_scalar_x_special(uint32_t& ix, uint32_t iy, uint32_t& sign_bias, float* out) { if (ix & 0x80000000u) { // Negative x: check integer parity of y to determine sign of result. // checkint returns: 0 = not integer, 1 = even integer, 2 = odd integer. 
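// Illustrative traces of checkint (not part of the library), matching the
// three return values described above:
//
//   y = 3.0f → iy = 0x40400000, e = 128, shift = 22:
//              no bits below bit 22, bit 22 set     → 2 (odd integer)
//   y = 4.0f → iy = 0x40800000, e = 129, shift = 21:
//              no bits below bit 21, bit 21 clear   → 1 (even integer)
//   y = 2.5f → iy = 0x40200000, e = 128, shift = 22:
//              bit 21 set below the integer point   → 0 (not integer)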
int32_t yint = checkint(iy); if (yint == 2) sign_bias = 0x80000000u; // odd integer y → negate final result ix &= 0x7fffffffu; // work with |x| from here if (ix == 0 || ix == 0x7f800000u) { // -0 or -inf: |x|^2 maps 0→0 and inf→inf. Then apply sign and // reciprocal (negative y inverts: pow(-0, -3) = -inf). float x2 = bit_cast(ix) * bit_cast(ix); if (iy & 0x80000000u) x2 = 1.0f / x2; *out = bit_cast(bit_cast(x2) ^ sign_bias); return true; } if (yint == 0) { *out = std::numeric_limits::quiet_NaN(); // neg ^ non-integer = NaN return true; } // Negative finite nonzero x with integer y: fall through to core with |x|. } // Test for +0 or +inf. "2u*ix - 1u >= 2u*0x7f800000u - 1u" is true when // ix is 0 (wraps to UINT_MAX) or >= 0x7f800000 (inf/NaN, but NaN was // already handled by the caller's y-special path). if (2u * ix - 1u >= 2u * 0x7f800000u - 1u) { float x2 = bit_cast(ix) * bit_cast(ix); if (iy & 0x80000000u) x2 = 1.0f / x2; *out = x2; return true; } if (ix < 0x00800000u) { // Subnormal x: scale by 2^23 to normalize, then subtract 23 from the // biased exponent so the log2 core sees the correct value. ix = bit_cast(bit_cast(ix) * 0x1p23f); ix -= 23u << 23; } return false; // Fall through to core computation with adjusted ix. } // SIMD pow fixup for special inputs (zero, subnormal, inf, NaN). // The double-precision cores produce garbage for these because logarithmSep's // range reduction (exponent extraction via integer subtract) is undefined for // zero/inf/NaN bit patterns. This cold-path fixup corrects only the affected // lanes; the branch is almost never taken for normal workloads. template DISPENSO_INLINE void pow_simd_special_fixup(Flt ax, Flt y, Flt& r) { using IntT = IntType_t; using UintT = UintType_t; IntT axi = bit_cast(ax); // Detect special lanes: subnormal (exponent field 0, mantissa nonzero), // zero (all bits 0), inf (0x7F800000), NaN (> 0x7F800000). // nonnormal() checks exponent == 0xFF (inf/NaN). Combined with < 0x00800000 // (subnormal/zero), this covers all non-normal inputs. auto is_special = (axi < IntT(0x00800000)) | nonnormal(axi); if (DISPENSO_EXPECT(any_true(is_special), 0)) { // Subnormals: the hybrid core's range reduction (ix - kOff) can't handle // exponent-0 bit patterns. Recompute these lanes via poly core, whose // logarithmSep prescales subnormals by 2^23 before extraction. auto is_subnorm = (axi > IntT(0)) & (axi < IntT(0x00800000)); if (any_true(is_subnorm)) { Flt r_sub = pow_double_poly_core(ax, y); r = FloatTraits::conditional(is_subnorm, r_sub, r); } // Zero/inf/NaN lanes: use an integer trick that swaps 0↔inf in one op. // IEEE 754 bit layout: 0 = 0x00000000, +inf = 0x7F800000. // 0x7F800000 - 0x00000000 = 0x7F800000 (+inf) → pow(0, pos) = 0, pow(0, neg) = inf // 0x7F800000 - 0x7F800000 = 0x00000000 (0) → pow(inf, pos) = inf, pow(inf, neg) = 0 // 0x7F800000 - (NaN bits) wraps negative, which reinterprets as NaN // (sign bit irrelevant for NaN). So: // y > 0 → result = ax (0 stays 0, inf stays inf) // y < 0 → result = flipped (0↔inf swap) // NaN → stays NaN either way auto is_zinf = (axi == IntT(0)) | nonnormal(axi); UintT ax_bits = bit_cast(ax); Flt flipped = bit_cast(UintT(0x7F800000u) - ax_bits); Flt special_r = FloatTraits::conditional(y < 0.0f, flipped, ax); r = FloatTraits::conditional(is_zinf, special_r, r); } } // Scalar-exponent fast paths: exact results for common y values and // binary squaring for small integer exponents. Sets y_is_int and y_is_odd // as output for the caller's sign-handling path when returning false. 
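// Illustrative trace of the binary-squaring loop used below for small
// integer exponents (not part of the library). For y = 13 (binary 1101),
// starting from result = 1, base = |x|:
//
//   n = 13: bit set   → result = x         ; base = x^2
//   n = 6 : bit clear →                      base = x^4
//   n = 3 : bit set   → result = x * x^4   ; base = x^8
//   n = 1 : bit set   → result = x^5 * x^8 = x^13
//
// i.e. roughly 2*log2(y) multiplies instead of y - 1, and every multiply is
// a uniform SIMD op, so the same code path serves all lanes.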
template DISPENSO_INLINE bool pow_scalar_y_fast_path(Flt x, float y, bool& y_is_int, bool& y_is_odd, Flt& out) { using UintT = UintType_t; if (y == 0.0f) { out = Flt(1.0f); return true; } if (y == 1.0f) { out = x; return true; } if (y == -1.0f) { out = Flt(1.0f) / x; return true; } if (y == 0.5f) { constexpr float kNaN = std::numeric_limits::quiet_NaN(); auto negx = x < 0.0f; out = FloatTraits::conditional(negx, Flt(kNaN), FloatTraits::sqrt(x)); return true; } if (y == 2.0f) { out = x * x; return true; } float ay = std::fabs(y); y_is_int = (std::floor(y) == y); y_is_odd = y_is_int && (std::fmod(ay, 2.0f) == 1.0f); if (y_is_int && ay < 64.0f) { // Binary squaring with uniform SIMD multiplies. int32_t n = static_cast(ay); Flt result(1.0f); Flt base = fabs(x); while (n > 0) { if (n & 1) result = result * base; base = base * base; n >>= 1; } if (y < 0.0f) result = Flt(1.0f) / result; if (y_is_odd) { UintT xsign = bit_cast(x) & UintT(0x80000000u); result = bit_cast(bit_cast(result) ^ xsign); } out = result; return true; } return false; } } // namespace detail // Compile-time check that Flt is a supported type: float, a SIMD float wrapper, // or a raw intrinsic type that maps to one via SimdTypeFor. template constexpr void assert_float_type() { // For raw intrinsics (__m128 etc.), SimdType_t maps to the wrapper (SseFloat etc.). // For wrappers and float, SimdType_t is identity. Check that FloatTraits exists // by verifying the kOne constant is present (only defined for float-based types). static_assert( FloatTraits>::kOne == 0x3f800000, "fast_math only supports float and float-based SIMD types (SseFloat, AvxFloat, etc.)"); } /** * @brief Hardware-delegated square root. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Accepted but ignored; result is always 0 ULP. * @param x Input value. * @return Square root of @p x. Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt sqrt(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return FloatTraits>::sqrt(SimdType_t(x)).v; } else { return std::sqrt(x); } } /** * @brief Cube root approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 12 ULP, MaxAccuracy: 3 ULP. * kBoundsValues: returns input for inf/NaN/zero. * @param x Input value (all float domain, including denormals). * @return Cube root of @p x. Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt cbrt(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { // Raw intrinsic type (__m128, Vec512, etc.) — forward to wrapper. return cbrt, AccuracyTraits>(SimdType_t(x)).v; } else if constexpr (std::is_same_v) { // Scalar path: branchless for the normal case. // Denormals get prescale=2^24, postscale=1/256 (cmov). // Zero is zeroed via AND mask (no blend). uint32_t ui = bit_cast(x); uint32_t usgn = ui & 0x80000000u; uint32_t uabs = ui & 0x7fffffffu; if constexpr (AccuracyTraits::kBoundsValues) { if (DISPENSO_EXPECT((uabs & 0x7f800000u) == 0x7f800000u, 0)) { return x; // inf/NaN } if (DISPENSO_EXPECT(uabs == 0u, 0)) { return x; // ±0 (preserves -0) } } // expZero covers both zero and denormal; zero is handled by AND mask below. bool expZero = (uabs & 0x7f800000u) == 0u; float prescale = expZero ? 16777216.0f : 1.0f; float postscale = expZero ? 
(1.0f / 256.0f) : 1.0f; float absX = bit_cast(uabs) * prescale; int32_t iScaled = static_cast(bit_cast(absX)); float sgnx = bit_cast(usgn | 0x3f800000u); float result = sgnx / detail::rcp_cbrt(absX, iScaled); result *= postscale; // Zero mask: AND zeros out result for zero input, OR sign bit back in // so that cbrt(-0) = -0. uint32_t zeroMask = 0u - static_cast(uabs != 0u); return bit_cast((bit_cast(result) & zeroMask) | usgn); } else { // SIMD wrapper path: prescale/postscale multipliers + AND mask for zero. using IntT = IntType_t; IntT i = bit_cast(x); IntT isgn = i & 0x80000000; i &= 0x7fffffff; // expZero covers both zero and denormal. auto expZero = (i & 0x7f800000) == 0; Flt prescale = FloatTraits::conditional(expZero, Flt(16777216.0f), Flt(1.0f)); Flt postscale = FloatTraits::conditional(expZero, Flt(1.0f / 256.0f), Flt(1.0f)); Flt absX = bit_cast(i) * prescale; IntT iScaled = bit_cast(absX); Flt sgnx = bit_cast(isgn | FloatTraits::kOne); Flt result = sgnx / detail::rcp_cbrt(absX, iScaled); result = result * postscale; // Zero: AND mask zeroes result, OR sign bit back so cbrt(-0) = -0. auto zeroMask = bool_as_mask(i != 0); result = bit_cast((bit_cast(result) & zeroMask) | isgn); if constexpr (AccuracyTraits::kBoundsValues) { return FloatTraits::conditional(nonnormal(iScaled), x, result); } return result; } } /** * @brief Decompose a float into mantissa and exponent (bit-accurate for normals). * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Accepted but ignored; result is always bit-accurate. * @param x Input value. * @param eptr Pointer to receive the exponent. * @return Mantissa in [0.5, 1) for normal values. Returns @p x unchanged for * inf/NaN/zero/subnormal. Subnormals are passed through with *eptr = 0 * rather than normalized into [0.5, 1) as std::frexp does. This means the * mantissa contract is violated for subnormals, but ldexp(frexp(x)) == x * still holds (the round-trip is an identity). Code that depends on a * normalized mantissa (e.g., counting significant bits) should not use * subnormal inputs. */ template DISPENSO_INLINE Flt frexp(Flt x, IntType_t* eptr) { assert_float_type(); if constexpr (!std::is_same_v>) { return frexp>(SimdType_t(x), eptr).v; } else { using IntT = IntType_t; IntT ix = bit_cast>(x); *eptr = 0; return FloatTraits::conditional( nonnormalOrZero(ix), x, detail::frexpImpl(ix, eptr)); } } /** * @brief Multiply a float by a power of 2 (bit-accurate). * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Accepted but ignored; result is always bit-accurate. * @param x Input value. * @param e Exponent to apply (x * 2^e). * @return @p x * 2^e. Returns @p x unchanged for inf/NaN. * Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt ldexp(Flt x, IntType_t e) { assert_float_type(); if constexpr (!std::is_same_v>) { return ldexp>(SimdType_t(x), e).v; } else { using IntT = IntType_t; IntT hx = bit_cast(x); return FloatTraits::conditional(nonnormal(hx), x, detail::ldexpImpl(hx, e)); } } /** * @brief Arc cosine approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 3 ULP. kMaxAccuracy: 2 ULP (degree-8 polynomial). * @param x Input value in [-1, 1]. * @return Arc cosine of @p x in radians. Compatible with all SIMD backends. 
*/ template DISPENSO_INLINE Flt acos(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return acos, AccuracyTraits>(SimdType_t(x)).v; } else { BoolType_t xneg = x < 0.0f; // abs UintType_t xi = bit_cast>(x); xi &= 0x7fffffff; x = bit_cast(xi); // acos(x) = sqrt(1-x) * P(x), where P approximates acos(x)/sqrt(1-x) on [0, 1). // Horner (not Estrin): Estrin regresses acos ULP from 3 to 4. Flt y; if constexpr (AccuracyTraits::kMaxAccuracy || AccuracyTraits::kBoundsValues) { // Sollya fpminimax(acos(x)/sqrt(1-x), 8, [|SG...|], [0, 1-1b-23]): // sup-norm error < 2^-25. y = dispenso::fast_math::hornerEval( x, 0x1.d129f2p-11f, -0x1.3ca11ap-8f, 0x1.9a3376p-7f, -0x1.6a1a08p-6f, 0x1.10b838p-5f, -0x1.a0365ep-5f, 0x1.6cce04p-4f, -0x1.b7820cp-3f, 0x1.921fb6p0f); } else { // Sollya fpminimax(acos(x)/sqrt(1-x), 7, [|SG...|], [0, 1-1b-23]): // sup-norm error < 2^-24. y = dispenso::fast_math::hornerEval( x, -0x1.39a988p-10f, 0x1.a50fdp-8f, -0x1.120206p-6f, 0x1.f5a1e2p-6f, -0x1.9a1efp-5f, 0x1.6c5d48p-4f, -0x1.b77e7ep-3f, 0x1.921fb4p0f); } auto sqrt1mx = FloatTraits::sqrt(1.0f - x); return FloatTraits::conditional( xneg, FloatTraits::fma(y, -sqrt1mx, Flt(kPi)), y * sqrt1mx); } } /** * @brief Arc sine approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 2 ULP. kMaxAccuracy: 1 ULP (higher-degree polynomials). * @param x Input value in [-1, 1]. * @return Arc sine of @p x in radians. Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt asin(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return asin, AccuracyTraits>(SimdType_t(x)).v; } else { using IntT = IntType_t; IntT xi = bit_cast(x); IntT sgnbit = xi & 0x80000000; xi &= 0x7fffffff; x = bit_cast(xi); Flt ret; if constexpr (std::is_same_v) { if (xi > 0x3f000000) { // x > 0.5 ret = detail::asin_pt5_1(x); } else { ret = detail::asin_0_pt5(x); } } else { // Use compensating sum for better rounding. auto y = detail::asin_pt5_1(x); auto z = detail::asin_0_pt5(x); ret = FloatTraits::conditional(xi > 0x3f000000, y, z); // choose between correct estimate } return bit_cast(sgnbit | bit_cast(ret)); } } /** * @brief Sine approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits kMaxAccuracy uses higher-precision range reduction. * Default: 2 ULP in [-2^20 pi, 2^20 pi]; accuracy degrades for larger |x|. * @param x Input value in radians (all float domain). * @return Sine of @p x. Compatible with all SIMD backends. * * Scalar: pi/4 reduction with quadrant branching (lowest latency). * SIMD: pi reduction, single sin polynomial, sign flip (no blending, ~30-40% faster). */ template DISPENSO_INLINE Flt sin(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return sin, AccuracyTraits>(SimdType_t(x)).v; } else if constexpr (std::is_same_v) { // Scalar: pi/4 reduction + branch selects sin or cos polynomial per quadrant. constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2med = 7.54978942e-08f; constexpr float kPi_2lo = 5.39032794e-15f; constexpr float k2_pi = 0.636619747f; Flt j = detail::rangeReduce2(x, k2_pi, kPi_2hi, kPi_2med, kPi_2lo); return detail::sincos_pi_impl(x, j, 0); } else { // SIMD: pi reduction — single sin_pi_2 polynomial, sign flip, no blending. constexpr float kPi_hi = 3.1415925f; constexpr float kPi_med = 1.50995788e-07f; constexpr float kPi_lo = 1.07806559e-14f; constexpr float k1_pi = 0.318309886f; Flt j = detail::rangeReduce2(x, k1_pi, kPi_hi, kPi_med, kPi_lo); return detail::sin_pi_reduction(x, j); } } /** * @brief Cosine approximation. 
* @tparam Flt float or SIMD float type. * @tparam AccuracyTraits kMaxAccuracy uses 4-part Cody-Waite range reduction. * Default: 2 ULP in [-2^15 pi, 2^15 pi]. * MaxAccuracy: 2 ULP in [-2^20 pi, 2^20 pi]. * @param x Input value in radians (all float domain). * @return Cosine of @p x. Compatible with all SIMD backends. * * Scalar: pi/4 reduction with quadrant branching (lowest latency). * SIMD: offset-pi reduction (q = 2*trunc(|x|/pi)+1), single sin polynomial, * sign flip (no blending, ~30% faster). */ template DISPENSO_INLINE Flt cos(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return cos, AccuracyTraits>(SimdType_t(x)).v; } else if constexpr (std::is_same_v) { // Scalar: pi/4 reduction + branch selects sin or cos polynomial per quadrant. Flt j; if constexpr (AccuracyTraits::kMaxAccuracy) { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2medh = 7.54978942e-08f; constexpr float kPi_2medl = 5.39030253e-15f; constexpr float kPi_2lo = 3.28200367e-22f; constexpr float k2_pi = 0.636619747f; j = detail::rangeReduce3(x, k2_pi, kPi_2hi, kPi_2medh, kPi_2medl, kPi_2lo); } else { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2med = 7.54978942e-08f; constexpr float kPi_2lo = 5.39032794e-15f; constexpr float k2_pi = 0.636619747f; j = detail::rangeReduce2(x, k2_pi, kPi_2hi, kPi_2med, kPi_2lo); } return detail::sincos_pi_impl(x, j, 1); } else { // SIMD: offset-pi reduction maps cos zeros to x_r = 0, then evaluates sin_pi_2. auto fma = FloatTraits::fma; Flt y = fabs(x); constexpr float k1_pi = 0.318309886f; auto qi = convert_to_int_trunc(y * k1_pi); qi = qi + qi + 1; Flt qf = Flt(qi); if constexpr (AccuracyTraits::kMaxAccuracy) { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2medh = 7.54978942e-08f; constexpr float kPi_2medl = 5.39030253e-15f; constexpr float kPi_2lo = 3.28200367e-22f; y = fma(qf, Flt(-kPi_2hi), y); y = fma(qf, Flt(-kPi_2medh), y); y = fma(qf, Flt(-kPi_2medl), y); y = fma(qf, Flt(-kPi_2lo), y); } else { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2med = 7.54978942e-08f; constexpr float kPi_2lo = 5.39032794e-15f; y = fma(qf, Flt(-kPi_2hi), y); y = fma(qf, Flt(-kPi_2med), y); y = fma(qf, Flt(-kPi_2lo), y); } return detail::cos_offset_pi_reduction(y, qi); } } /** * @brief Simultaneous sine and cosine, sharing range reduction. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Same behavior as sin/cos: kMaxAccuracy controls * range reduction precision. Default: 2 ULP each. * @param x Input value in radians (all float domain). * @param[out] out_sin Pointer to receive sin(x). * @param[out] out_cos Pointer to receive cos(x). * @note Can be faster than separate sin() + cos() calls due to shared range * reduction and polynomial evaluation (SIMD). Compatible with all SIMD backends. 
*/ template DISPENSO_INLINE void sincos(Flt x, Flt* out_sin, Flt* out_cos) { assert_float_type(); if constexpr (!std::is_same_v>) { SimdType_t s, c; sincos, AccuracyTraits>(SimdType_t(x), &s, &c); *out_sin = s.v; *out_cos = c.v; } else { Flt j; if constexpr (AccuracyTraits::kMaxAccuracy) { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2medh = 7.54978942e-08f; constexpr float kPi_2medl = 5.39030253e-15f; constexpr float kPi_2lo = 3.28200367e-22f; constexpr float k2_pi = 0.636619747f; j = detail::rangeReduce3(x, k2_pi, kPi_2hi, kPi_2medh, kPi_2medl, kPi_2lo); } else { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2med = 7.54978942e-08f; constexpr float kPi_2lo = 5.39032794e-15f; constexpr float k2_pi = 0.636619747f; j = detail::rangeReduce2(x, k2_pi, kPi_2hi, kPi_2med, kPi_2lo); } detail::sincos_pi_impl_both(x, j, out_sin, out_cos); } } /** * @brief Compute sin(pi * x) with exact range reduction. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Accepted but ignored (range reduction is always exact). * @param x Input value (all float domain). Exact at integers (returns 0) and * half-integers (returns ±1). ~2 ULP for general inputs. * @return sin(pi * x). Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt sinpi(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return sinpi>(SimdType_t(x)).v; } else { // Exact range reduction: j = round(2*x), r = x - j/2, t = r * pi. constexpr float kMagic = FloatTraits::kMagic; // 1.5 * 2^23 Flt j = (2.0f * x + kMagic) - kMagic; // round(2*x) Flt r = x - j * 0.5f; // exact for |x| < 2^22 Flt t = r * kPi; // only error source return detail::sincos_pi_impl(t, j, 0); } } /** * @brief Compute cos(pi * x) with exact range reduction. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Accepted but ignored (range reduction is always exact). * @param x Input value (all float domain). Exact at integers (returns ±1) and * half-integers (returns 0). ~2 ULP for general inputs. * @return cos(pi * x). Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt cospi(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return cospi>(SimdType_t(x)).v; } else { constexpr float kMagic = FloatTraits::kMagic; Flt j = (2.0f * x + kMagic) - kMagic; Flt r = x - j * 0.5f; Flt t = r * kPi; return detail::sincos_pi_impl(t, j, 1); } } /** * @brief Simultaneous sin(pi*x) and cos(pi*x) with exact range reduction. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Accepted but ignored. * @param x Input value (all float domain). * @param[out] out_sin Pointer to receive sin(pi * x). * @param[out] out_cos Pointer to receive cos(pi * x). * @note Faster than separate sinpi() + cospi() calls. ~2 ULP each. * Compatible with all SIMD backends. */ template DISPENSO_INLINE void sincospi(Flt x, Flt* out_sin, Flt* out_cos) { assert_float_type(); if constexpr (!std::is_same_v>) { SimdType_t s, c; sincospi>(SimdType_t(x), &s, &c); *out_sin = s.v; *out_cos = c.v; } else { constexpr float kMagic = FloatTraits::kMagic; Flt j = (2.0f * x + kMagic) - kMagic; Flt r = x - j * 0.5f; Flt t = r * kPi; detail::sincos_pi_impl_both(t, j, out_sin, out_cos); } } /** * @brief Tangent approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 3 ULP in [-256K pi, 256K pi]. * kMaxAccuracy uses 4-part Cody-Waite reduction for wider accurate range. * @param x Input value in radians; undefined at odd multiples of pi/2. * @return Tangent of @p x. Compatible with all SIMD backends. 
*/ template DISPENSO_INLINE Flt tan(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return tan, AccuracyTraits>(SimdType_t(x)).v; } else { Flt j; if constexpr (AccuracyTraits::kMaxAccuracy) { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2medh = 7.54978942e-08f; constexpr float kPi_2medl = 5.39030253e-15f; constexpr float kPi_2lo = 3.28200367e-22f; constexpr float k2_pi = 0.636619747f; j = detail::rangeReduce3(x, k2_pi, kPi_2hi, kPi_2medh, kPi_2medl, kPi_2lo); } else { constexpr float kPi_2hi = 1.57079625f; constexpr float kPi_2med = 7.54978942e-08f; constexpr float kPi_2lo = 5.39032794e-15f; constexpr float k2_pi = 0.636619747f; j = detail::rangeReduce2(x, k2_pi, kPi_2hi, kPi_2med, kPi_2lo); } return detail::tan_pi_2_impl(x, j); } } /** * @brief Arc tangent approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 3 ULP. kMaxAccuracy: 2 ULP (degree-11 polynomial). * @param x Input value (all float domain). * @return Arc tangent of @p x in radians. Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt atan(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return atan, AccuracyTraits>(SimdType_t(x)).v; } else { using IntT = IntType_t; IntT xi = bit_cast(x); IntT sgnbit = xi & 0x80000000; xi &= 0x7fffffff; x = bit_cast(xi); auto flip = x > 1.0f; // TODO(bbudge): Verify, Some compilers (MSVC) don't optimize this as well, check for scalar // case. if constexpr (std::is_same_v) { if (flip) { x = 1.0f / x; } } else { x = FloatTraits::conditional(flip, 1.0f / x, x); } auto y = detail::atan_poly(x); y = FloatTraits::conditional(flip, kPi_2 - y, y); y = bit_cast(bit_cast(y) | sgnbit); return y; } } /** * @brief Base-2 exponential approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 1 ULP. kMaxAccuracy has no additional effect. * kBoundsValues: returns NaN for NaN, clamps input to avoid overflow. * @param x Input value; [-127, 128] for normal output. * @return 2^x. Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt exp2(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return exp2, AccuracyTraits>(SimdType_t(x)).v; } else { using IntT = IntType_t; Flt xf; IntT xi; if constexpr (AccuracyTraits::kBoundsValues) { if constexpr (std::is_same_v) { if (!(x == x)) { return x; } x = clamp_allow_nan(x, -127.0f, 128.0f); xf = floor_small(x); xi = convert_to_int(xf); } else { // bool_as_mask converts any comparison result (lane-wide or __mmask16) // to IntType uniformly across all SIMD backends. IntType_t nonanmask = bool_as_mask>(x == x); x = clamp_allow_nan(x, Flt(-127.0f), Flt(128.0f)); xf = floor_small(x); xi = nonanmask & convert_to_int(xf); } } else { xf = floor_small(x); xi = convert_to_int(xf); } x -= xf; xi += 127; xi <<= 23; Flt powxi = bit_cast(xi); Flt y = dispenso::fast_math::hornerEval( x, 0x1.c41242p-13f, 0x1.4748aep-10f, 0x1.3cfb6p-7f, 0x1.c68b2ep-5f, 0x1.ebfd38p-3f, 0x1.62e42cp-1f, 0x1p0f); return powxi * y; } } /** * @brief Natural exponential approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 3 ULP. MaxAccuracy: 1 ULP (Sollya-optimized * polynomial + Norbert Juffa's rounding technique). * kBoundsValues: returns 0 for large negative, inf for large positive, NaN for NaN. * @param x Input value; [-89, 89] for normal output. * @return e^x. Compatible with all SIMD backends. 
*/ template DISPENSO_INLINE Flt exp(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return exp, AccuracyTraits>(SimdType_t(x)).v; } else { using IntT = IntType_t; using UintT = UintType_t; constexpr float k1_ln2 = k1_Ln2; constexpr float kLn2hi = 6.93145752e-1f; constexpr float kLn2lo = 1.42860677e-6f; if constexpr (AccuracyTraits::kMaxAccuracy || AccuracyTraits::kBoundsValues) { // Rounding algorithm courtesy Norbert Juffa, https://stackoverflow.com/a/40519989/830441 // exp(x) = 2**i * exp(f); i = rintf (x / log(2)) Flt f = x; Flt j = detail::rangeReduce(f, k1_ln2, kLn2hi, kLn2lo); IntT i = convert_to_int(j); // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2] Flt y = dispenso::fast_math::hornerEval( f, 0x1.6850e4p-10f, 0x1.123bccp-7f, 0x1.555b98p-5f, 0x1.55548ep-3f, 0x1.fffff8p-2f, 1.f, 1.f); // exp(x) = 2**i * y (with rounding) auto ipos = i > IntT(0); UintT ia = FloatTraits::conditional(ipos, UintT(0u), UintT(0x83000000u)); Flt s = bit_cast(UintT(0x7f000000u) + ia); Flt t = bit_cast((UintT(i) << 23) - ia); y = y * s; y = y * t; if constexpr (AccuracyTraits::kBoundsValues) { if constexpr (std::is_same_v) { auto zero = (x < -89.0f); auto overflow = (x > 89.0f); if ((zero || overflow)) { Flt orbits = bit_cast(bool_apply_or_zero(overflow, 0x7f800000)); y = FloatTraits::conditional(zero | overflow, orbits, y); } } else { auto zero = (x < -89.0f); auto overflow = (x > 89.0f); IntT orbits = bool_apply_or_zero(overflow, IntT(0x7f800000)); auto mask = bool_as_mask(zero) | bool_as_mask(overflow); y = FloatTraits::conditional(mask, bit_cast(orbits), y); } } return y; } else { Flt j = detail::rangeReduceFloor(x, k1_ln2, kLn2hi, kLn2lo); IntT xi = convert_to_int(j); xi += 127; xi <<= 23; Flt powxi = bit_cast(xi); // approximate r = exp(f) on interval [0, ln2] Flt y = dispenso::fast_math::hornerEval( x, 0x1.8014dp-7f, 0x1.3f3846p-5f, 0x1.57481ep-3f, 0x1.ffd96ep-2f, 0x1.000086p0f, 0x1.fffffep-1f); return powxi * y; } } } /** * @brief Base-10 exponential approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 2 ULP. kBoundsValues: handles NaN. * @param x Input value; [-38, 38] for normal output. * @return 10^x. Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt exp10(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return exp10, AccuracyTraits>(SimdType_t(x)).v; } else { using IntT = IntType_t; constexpr double kLog10of2d = 0.3010299956639812; constexpr float k1_log10of2 = static_cast(1.0 / kLog10of2d); constexpr float kLog10of2 = static_cast(kLog10of2d); constexpr float kLog10of2lo = static_cast(kLog10of2d - kLog10of2); Flt j; IntT xi; if constexpr (AccuracyTraits::kBoundsValues) { IntType_t nonanmask = bool_as_mask>(x == x); x = clamp_allow_nan(x, Flt(-38.23080944932561f), Flt(38.53183944498959f)); j = detail::rangeReduceFloor(x, k1_log10of2, kLog10of2, kLog10of2lo); xi = nonanmask & convert_to_int(j); } else { j = detail::rangeReduceFloor(x, k1_log10of2, kLog10of2, kLog10of2lo); xi = convert_to_int(j); } xi += 127; xi <<= 23; Flt powxi = bit_cast(xi); Flt y = dispenso::fast_math::hornerEval( x, 0x1.293a54p-2f, 0x1.024feap-1f, 0x1.2d9da2p0f, 0x1.0459f6p1f, 0x1.53534cp1f, 0x1.26bb18p1f, 0x1p0f); return powxi * y; } } /** * @brief Base-2 logarithm approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 1 ULP. kMaxAccuracy handles denormals correctly. * kBoundsValues: returns -inf for 0, NaN for negative, +inf for +inf. * @param x Input value in (0, +inf). * @return log2(x). 
Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt log2(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return log2, AccuracyTraits>(SimdType_t(x)).v; } else { auto [xi, i, m] = detail::logarithmSep(x); m = m - 1.0f; // Compute log2(1+m) for m in [sqrt(0.5)-1, sqrt(2.0)-1] // Degree-9 polynomial with c0=0 absorbed: the Horner chain evaluates c9..c1, // then fma(y, m, m) applies the final y*m + m (since c0=0, c1 is folded into +m). Flt y = dispenso::fast_math::hornerEval( m, -1.09985352e-1f, 1.86182275e-1f, -1.91066533e-1f, 2.04593703e-1f, -2.39627063e-1f, 2.88573444e-1f, -3.60695332e-1f, 4.80897635e-1f, -7.21347392e-1f, 4.42695051e-1f); y = FloatTraits::fma(y, m, m); // c0=0 absorbed: y*m + m y = y + i; /* Check for and handle special cases */ if constexpr (AccuracyTraits::kBoundsValues) { y = detail::logarithmBounds(x, y, xi); } return y; } } /** * @brief Natural logarithm approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 2 ULP. kMaxAccuracy handles denormals correctly. * kBoundsValues: returns -inf for 0, NaN for negative, +inf for +inf. * @param x Input value in (0, +inf). * @return ln(x). Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt log(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return log, AccuracyTraits>(SimdType_t(x)).v; } else { auto [xi, i, m] = detail::logarithmSep(x); m = m - 1.0f; // Compute log(1+m) for m in [sqrt(0.5)-1, sqrt(2.0)-1] // Degree-9 polynomial with c0=0, c1=1 absorbed: y = P(m)*m² + m. // P(m) evaluates c9..c2 (8 coefficients), then fma(P, m², m) applies c1=1 and c0=0. Flt y = dispenso::fast_math::hornerEval( m, 0.0924733654f, -0.14482744f, 0.148145974f, -0.165455937f, 0.199700251f, -0.250024557f, 0.333337069f, -0.499999911f); y = FloatTraits::fma(y, m * m, m); // c0=0, c1=1 absorbed y = FloatTraits::fma(i, kLn2, y); /* Check for and handle special cases */ if constexpr (AccuracyTraits::kBoundsValues) { y = detail::logarithmBounds(x, y, xi); } return y; } } /** * @brief Base-10 logarithm approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 3 ULP. kMaxAccuracy handles denormals correctly. * kBoundsValues: returns -inf for 0, NaN for negative, +inf for +inf. * @param x Input value in (0, +inf). * @return log10(x). Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt log10(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return log10, AccuracyTraits>(SimdType_t(x)).v; } else { auto [xi, i, m] = detail::logarithmSep(x); m = m - 1.0f; // Compute log10(1+m) for m in [sqrt(0.5)-1, sqrt(2.0)-1] auto fma = FloatTraits::fma; // Degree-9 polynomial with c0=0 absorbed: y = P(m) * m, where // P(m) = c9*m^8 + ... + c1 (9 coefficients). Flt y = dispenso::fast_math::hornerEval( m, 0.0326662175f, -0.0601400957f, 0.0659840927f, -0.0723559037f, 0.086600922f, -0.108560629f, 0.144770443f, -0.217147425f, 0.434294462f) * m; constexpr float kLog10of2 = 0.30102999566398f; y = fma(i, kLog10of2, y); /* Check for and handle special cases */ if constexpr (AccuracyTraits::kBoundsValues) { y = detail::logarithmBounds(x, y, xi); } return y; } } /** * @brief Two-argument arc tangent approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: 3 ULP. kMaxAccuracy: 2 ULP (degree-11 polynomial). * kBoundsValues: handles the case where both arguments are inf. * @param y Y coordinate (all float domain). * @param x X coordinate (all float domain). * @return atan2(y, x) in radians. 
Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt atan2(Flt y, Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return atan2, AccuracyTraits>(SimdType_t(y), SimdType_t(x)).v; } else { using UintT = UintType_t; UintT yi = bit_cast(y); UintT xi = bit_cast(x); auto someNonzero = ((yi | xi) & 0x7fffffff) != 0; UintT xsgn = xi & 0x80000000; UintT ysgn = yi & 0x80000000; UintT allsgn = xsgn ^ ysgn; yi &= 0x7fffffff; xi &= 0x7fffffff; auto flip = yi > xi; if constexpr (AccuracyTraits::kBoundsValues) { using IntT = IntType_t; auto yinf = yi == UintT(0x7f800000u); auto bothinf = (yinf & (xi == UintT(0x7f800000u))); auto bothinfi = bool_as_mask(bothinf); yi = FloatTraits::conditional(bothinfi, UintT(0x7f7fffffu), yi); xi = FloatTraits::conditional(bothinfi, UintT(0x7f7fffffu), xi); // keepsgn = NOT(yinf AND NOT bothinf) = bothinf OR NOT yinf. auto keepsgn = !((!bothinf) & yinf); xsgn &= bool_as_mask(keepsgn); } Flt den = bit_cast(FloatTraits::conditional(flip, yi, xi)); Flt num = bit_cast(FloatTraits::conditional(flip, xi, yi)); Flt y_x = bool_apply_or_zero(someNonzero, num / den); auto z = detail::atan_poly(y_x); z = FloatTraits::conditional(flip, kPi_2 - z, z); z = bit_cast(bit_cast(z) | allsgn); // Branchless offset: π where x<0, 0 where x≥0, with y's sign applied. UintT x_neg_mask = UintT(0) - (xsgn >> 31); Flt pi_or_zero = bit_cast(x_neg_mask & bit_cast(Flt(kPi))); Flt offset = bit_cast(bit_cast(pi_or_zero) | ysgn); return z + offset; } } /** * @brief Hypotenuse: sqrt(x*x + y*y) with overflow protection. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: ~1 ULP for all normal floats. * kBoundsValues: handles inf/NaN per IEEE 754 (hypot(inf,NaN) = +inf). * kMaxAccuracy: accepted but has no additional effect. * @param x First input (all float domain). * @param y Second input (all float domain). * @return sqrt(x*x + y*y). Compatible with all SIMD backends. */ template DISPENSO_INLINE Flt hypot(Flt x, Flt y) { assert_float_type(); if constexpr (!std::is_same_v>) { return hypot, AccuracyTraits>(SimdType_t(x), SimdType_t(y)).v; } else if constexpr (std::is_same_v) { if constexpr (AccuracyTraits::kBoundsValues) { uint32_t ax = bit_cast(x) & 0x7fffffffu; uint32_t ay = bit_cast(y) & 0x7fffffffu; if (DISPENSO_EXPECT(ax == 0x7f800000u || ay == 0x7f800000u, 0)) { return std::numeric_limits::infinity(); } if (DISPENSO_EXPECT(ax > 0x7f800000u || ay > 0x7f800000u, 0)) { return std::numeric_limits::quiet_NaN(); } } // Scalar: cast to double for correctly-rounded result. double xd = static_cast(x); double yd = static_cast(y); return static_cast(std::sqrt(std::fma(xd, xd, yd * yd))); } else { // SIMD: dynamically scale inputs based on their exponent to keep // intermediates in normal float range. Extracts the exponent of // max(|x|,|y|), builds scale = 2^(127-E) so the scaled max is in [1,2). // Clamp ensures the scale itself stays in normal range. using IntT = IntType_t; using UintT = UintType_t; Flt abs_mask = bit_cast(UintT(0x7fffffffu)); Flt max_abs = FloatTraits::max(x & abs_mask, y & abs_mask); // Clamp to [FLT_MIN, 2^126] so scale and unscale stay normal. Flt clamped = FloatTraits::max(FloatTraits::min(max_abs, Flt(0x1p126f)), Flt(0x1p-126f)); IntT clamped_exp = bit_cast(clamped) & IntT(0x7f800000); // scale = 2^(127-E), unscale = 2^(E-127). 
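// Illustrative expansion of the exponent arithmetic below (not part of the
// library). If clamped has biased exponent e (clamped_exp = e << 23), then
//
//   0x7f000000 - (e << 23) = (254 - e) << 23,
//
// which reinterpreted as a float is 2^((254 - e) - 127) = 2^(127 - e): the
// exact reciprocal of clamped's power of two. E.g. for max(|x|,|y|) ≈ 1e20f
// (e = 193), scale = 2^-66 and unscale = 2^66, so the larger of |xs|, |ys|
// lands in [1, 2) and the final h * unscale restores the true magnitude.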
Flt scale = bit_cast(IntT(0x7f000000) - clamped_exp); Flt xs = x * scale; Flt ys = y * scale; auto fma = FloatTraits::fma; Flt h = FloatTraits::sqrt(fma(xs, xs, ys * ys)); // Newton refinement: h' = h + (xs²+ys²-h²) * rcp(2h) Flt r = fma(xs, xs, fma(ys, ys, -(h * h))); Flt rcp_2h = FloatTraits::rcp(h + h + Flt(std::numeric_limits::min())); h = fma(r, rcp_2h, h); Flt unscale = bit_cast(clamped_exp); h = h * unscale; if constexpr (AccuracyTraits::kBoundsValues) { // IEEE 754: hypot(±inf, y) = +inf even when y is NaN. // Inf inputs produce NaN through Newton refinement, so override here. UintT ax = bit_cast(x) & UintT(0x7fffffffu); UintT ay = bit_cast(y) & UintT(0x7fffffffu); auto either_inf = (ax == UintT(0x7f800000u)) | (ay == UintT(0x7f800000u)); auto inf_mask = bool_as_mask(either_inf); h = FloatTraits::conditional(inf_mask, bit_cast(UintT(0x7f800000u)), h); } return h; } } /** * @brief Power function: x^y. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: ~1 ULP for scalar float (double-precision core), * error scales as |y*log2(x)|*ln2 ULP for SIMD (~1-2 for moderate y). * kMaxAccuracy: uses extended-precision log2 for ~1-2 ULP even for large y (SIMD only; * scalar float always uses the double-precision core which subsumes this). * kBoundsValues: full IEEE 754 special-case handling (NaN, Inf, zero, negative bases). * @param x Base (all float domain, including negative). * @param y Exponent (SIMD vector). * @return x^y. Negative x: returns -|x|^y for odd integer y, NaN for non-integer y. * Compatible with all SIMD backends. * * Scalar float uses a table-based double-precision core (~2.5ns). * SIMD uses float-precision exp2/log2 with FMA error-free transform. */ template DISPENSO_INLINE Flt pow(Flt x, NonDeduced y) { assert_float_type(); if constexpr (!std::is_same_v>) { return pow, AccuracyTraits>(SimdType_t(x), SimdType_t(y)).v; } else if constexpr (std::is_same_v) { // Scalar float: double-precision pow core (table-based log2 + exp2). // Uses integer checkint() for parity (~5 insns vs ~40 for float_is_int/odd). // Branchless sign XOR. Table entry i=9 is {1.0, 0.0} so pow(1,y) = 1 exactly. uint32_t ix = bit_cast(x); uint32_t iy = bit_cast(y); // sign_bias is 0x80000000u for odd-integer y with negative x, else 0. // Always XORed into the result (no-op when 0). uint32_t sign_bias = 0; // neg_nan: set if negative x with non-integer y (domain error → NaN). // Only used by the non-bounds path (deferred after core computation). uint32_t neg_nan = 0; if constexpr (AccuracyTraits::kBoundsValues) { // Combined fast-path test: detects any x or y that needs special handling. // First term: ix - 0x00800000 >= 0x7F000000 is true when ix < 0x00800000 // (zero, subnormal) or ix >= 0x7F800000 (inf, NaN) — but also when ix // has the sign bit set (negative), because unsigned subtraction wraps // to a huge value. This means negative x enters the special block too, // avoiding a separate negative-x check. // Second term: 2u*iy - 1u >= 0xFEFFFFFF tests y = ±0, ±inf, or NaN // (see bit trick comments in pow_scalar_y_special). if (DISPENSO_EXPECT( ix - 0x00800000u >= 0x7f800000u - 0x00800000u || 2u * iy - 1u >= 2u * 0x7f800000u - 1u, 0)) { float special; if (2u * iy - 1u >= 2u * 0x7f800000u - 1u) { detail::pow_scalar_y_special(x, y, ix, iy, &special); return special; } if (detail::pow_scalar_x_special(ix, iy, sign_bias, &special)) return special; // Fall through to core computation with adjusted ix. 
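// Illustrative values for the combined fast-path test at the top of this
// block (not part of the library):
//
//   x =  1.5f → ix = 0x3FC00000, ix - 0x00800000 = 0x3F400000 < 0x7F000000
//               → ordinary positive x, the special block is skipped.
//   x = -2.0f → ix = 0xC0000000, ix - 0x00800000 = 0xBF800000 ≥ 0x7F000000
//               → the sign bit makes the unsigned value large, routing
//                 negative x in here without a separate sign test.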
} } else { // Non-bounds path: one branch for the common case (positive x), // branchless checkint in the else (rare negative-x path avoids further mispredicts). if (DISPENSO_EXPECT(!(ix & 0x80000000u), 1)) { // Positive x: nothing to do. } else { ix &= 0x7fffffffu; int32_t yint = detail::checkint_branchless(iy); sign_bias = static_cast(yint == 2) * 0x80000000u; neg_nan = static_cast(yint == 0); } } float r; if constexpr (AccuracyTraits::kMaxAccuracy) { // Tableless core: logarithmSep handles subnormals internally (pre-scales // by 2^23), so pass the original |x| — not the bounds-adjusted ix. r = detail::pow_double_poly_core(std::fabs(x), y); } else { // Table core: uses ix directly. The bounds path above already normalized // subnormals (scale by 2^23, subtract 23 from exponent) so ix is valid. r = detail::pow_double_core(bit_cast(ix), y); } // Branchless sign flip (no-op when sign_bias == 0). r = bit_cast(bit_cast(r) ^ sign_bias); // Non-bounds: apply deferred NaN for neg^non-int, and pow(x,0)=1. if constexpr (!AccuracyTraits::kBoundsValues) { if (neg_nan) r = std::numeric_limits::quiet_NaN(); if (y == 0.0f) r = 1.0f; } return r; } else { // SIMD path: double-precision core for all accuracy levels (~1 ULP). using UintT = UintType_t; Flt ax = fabs(x); Flt r; // Width-dependent dispatch: table gathers are cheap at 4 lanes but // scale poorly; polynomial cost amortizes over more lanes. if constexpr (sizeof(Flt) <= 16) { r = detail::pow_double_hybrid_core(ax, y); } else { r = detail::pow_double_poly_core(ax, y); } if constexpr (AccuracyTraits::kBoundsValues) { detail::pow_simd_special_fixup(ax, y, r); } // Sign handling for negative x (SIMD, branchless per-lane): // The double core computes pow(|x|, y). For negative x we need: // - odd integer y: negate result (e.g. pow(-2, 3) = -8) // - even integer y: keep positive (e.g. pow(-2, 2) = 4) // - non-integer y: result is NaN (domain error) // float_is_int/float_is_odd test this via mantissa bit inspection // (branchless SIMD comparisons, no scalar checkint needed). UintT xsign = bit_cast(x) & UintT(0x80000000u); auto negx = x < 0.0f; auto y_int = float_is_int(y); auto y_odd = float_is_odd(y); // sign_mask is 0x80000000 in lanes where x < 0 AND y is odd, else 0. // XORing into r negates exactly those lanes. UintT sign_mask = xsign & bool_as_mask(y_odd); r = bit_cast(bit_cast(r) ^ sign_mask); // Replace with NaN where x < 0 and y is not an integer. constexpr float kNaN = std::numeric_limits::quiet_NaN(); r = FloatTraits::conditional(negx & !y_int, Flt(kNaN), r); // pow(x, 0) = 1 for all x (IEEE 754). r = FloatTraits::conditional(y == 0.0f, Flt(1.0f), r); if constexpr (AccuracyTraits::kBoundsValues) { // pow(1, y) = 1 even for NaN y. r = FloatTraits::conditional(x == 1.0f, Flt(1.0f), r); // pow(-1, ±inf) = 1. constexpr float kInf = std::numeric_limits::infinity(); auto x_neg1 = (x == -1.0f); auto y_inf = fabs(y) == Flt(kInf); r = FloatTraits::conditional(x_neg1 & y_inf, Flt(1.0f), r); } return r; } } /** * @brief Power function with scalar exponent: x^y. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Same as pow(Flt, Flt). * @param x Base (all float domain, including negative). * @param y Scalar exponent — enables scalar branching optimizations. * @return x^y. Same semantics as pow(Flt, Flt). * Compatible with all SIMD backends. * * Scalar y enables fast paths: y=0→1, y=1→x, y=-1→1/x, y=0.5→sqrt, * y=2→x*x, integer y→binary squaring. General case uses exp2/log2. 
*/ template < typename Flt, typename AccuracyTraits = DefaultAccuracyTraits, typename = std::enable_if_t>> DISPENSO_INLINE Flt pow(Flt x, float y) { assert_float_type(); if constexpr (!std::is_same_v>) { return pow, AccuracyTraits>(SimdType_t(x), y).v; } else { using UintT = UintType_t; // Scalar fast paths: y=0→1, y=1→x, y=-1→1/x, y=0.5→sqrt, y=2→x*x, // small integer y → binary squaring. bool y_is_int = false, y_is_odd = false; Flt fast_result; if (detail::pow_scalar_y_fast_path(x, y, y_is_int, y_is_odd, fast_result)) return fast_result; Flt ax = fabs(x); Flt r; Flt yv(y); // Double-precision core for all SIMD accuracy levels (~1 ULP). if constexpr (sizeof(Flt) <= 16) { r = detail::pow_double_hybrid_core(ax, yv); } else { r = detail::pow_double_poly_core(ax, yv); } if constexpr (AccuracyTraits::kBoundsValues) { detail::pow_simd_special_fixup(ax, yv, r); } // Sign handling for negative x. if (y_is_odd) { UintT xsign = bit_cast(x) & UintT(0x80000000u); r = bit_cast(bit_cast(r) ^ xsign); } if (!y_is_int) { auto negx = x < 0.0f; constexpr float kNaN = std::numeric_limits::quiet_NaN(); r = FloatTraits::conditional(negx, Flt(kNaN), r); } if constexpr (AccuracyTraits::kBoundsValues) { r = FloatTraits::conditional(x == 1.0f, Flt(1.0f), r); constexpr float kInf = std::numeric_limits::infinity(); if (y == kInf || y == -kInf) { auto x_neg1 = (x == -1.0f); r = FloatTraits::conditional(x_neg1, Flt(1.0f), r); } } return r; } } /** * @brief Compute exp(x) - 1 with precision near zero. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: ~2 ULP (1-step CW + degree-4 poly). * MaxAccuracy: 1 ULP (2-step CW + degree-5 poly). * kBoundsValues: 1 ULP + handles NaN/inf. * @param x Input value (all float domain). * @return exp(x) - 1. Compatible with all SIMD backends. * * Uses Cody-Waite range reduction: x = n*ln2 + r, |r| <= ln2/2. * Then expm1(x) = 2^n * expm1(r) + (2^n - 1), where expm1(r) is computed * via a Sollya fpminimax polynomial. Avoids the catastrophic cancellation of * exp(x) - 1 near zero, and provides uniform accuracy across the entire * domain (no polynomial/fallback transition). */ template DISPENSO_INLINE Flt expm1(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return expm1, AccuracyTraits>(SimdType_t(x)).v; } else { using IntT = IntType_t; using UintT = UintType_t; auto fma_fn = FloatTraits::fma; // Cody-Waite range reduction: x = n*ln2 + r, |r| <= ln2/2. // 2-step CW is required for all accuracy levels because the // reconstruction 2^n * expm1(r) amplifies reduction error by 2^n. constexpr float k1_ln2 = k1_Ln2; constexpr float kLn2hi = 6.93145752e-1f; constexpr float kLn2lo = 1.42860677e-6f; Flt r = x; Flt jf = detail::rangeReduce(r, k1_ln2, kLn2hi, kLn2lo); IntT n = convert_to_int(jf); Flt em1_r; if constexpr (AccuracyTraits::kMaxAccuracy || AccuracyTraits::kBoundsValues) { // Degree-5 Sollya fpminimax((exp(x)-1-x)/x^2, 5, [|SG...|], [-ln2/2, ln2/2]): // sup-norm error < 2^-29 (~0.03 ULP at r = ln2/2). Flt p = dispenso::fast_math::hornerEval( r, 0x1.a26762p-13f, 0x1.6d2ep-10f, 0x1.110ff2p-7f, 0x1.555502p-5f, 0x1.555556p-3f, 0x1p-1f); em1_r = fma_fn(p, r * r, r); } else { // Degree-4 Sollya fpminimax((exp(x)-1-x)/x^2, 4, [|SG...|], [-ln2/2, ln2/2]): // sup-norm error < 2^-24 (~1.2 ULP at r = ln2/2). Flt p = dispenso::fast_math::hornerEval( r, 0x1.6ca992p-10f, 0x1.120abep-7f, 0x1.55556cp-5f, 0x1.5554dep-3f, 0x1p-1f); em1_r = fma_fn(p, r * r, r); } // Reconstruction: expm1(x) = 2^n * expm1(r) + (2^n - 1). // 2^n is exact for integer n. 
2^n - 1 is exact for |n| <= 23. // For |n| > 24, 2^n - 1 = 2^n in float, so expm1 ≈ exp which is correct. // Use fma for precision: fma(2^n, expm1(r), 2^n - 1). Flt two_n = bit_cast(UintT(n + 127) << 23); Flt two_n_m1 = two_n - 1.0f; Flt result = fma_fn(two_n, em1_r, two_n_m1); if constexpr (std::is_same_v) { // For n == 0 (|x| < ln2/2 ≈ 0.347): return polynomial directly. if (n == 0) return em1_r; // For x < -25*ln2 ≈ -17.3: expm1(x) rounds to -1 in float. if (x < -17.5f) return -1.0f; // For x > 89: exp(x) overflows, expm1(x) = inf. if (x > 89.0f) return std::numeric_limits::infinity(); // NaN propagation: rangeReduce maps NaN to finite r, producing a // garbage result. x - x is 0 for finite, NaN for NaN. if constexpr (AccuracyTraits::kBoundsValues) return result + (x - x); return result; } else { // SIMD: blend n==0 path (direct polynomial) with reconstruction. auto zero_n = (n == IntT(0)); result = FloatTraits::conditional(zero_n, em1_r, result); // Clamp: x < -17.5 → -1, x > 89 → inf. result = FloatTraits::conditional(x < -17.5f, Flt(-1.0f), result); constexpr float kInf = std::numeric_limits::infinity(); result = FloatTraits::conditional(x > 89.0f, Flt(kInf), result); if constexpr (AccuracyTraits::kBoundsValues) { // NaN propagation: range reduction maps NaN to finite values. auto is_nan = (bit_cast(x) & UintT(0x7fffffffu)) > UintT(0x7f800000u); result = FloatTraits::conditional(is_nan, x, result); } return result; } } } /** * @brief Compute log(1 + x) with precision near zero. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: ~2 ULP. kBoundsValues: handles x = -1 (→ -inf), NaN. * @param x Input value in (-1, +inf). * @return log(1 + x). Compatible with all SIMD backends. * * For |x| < 0.25: direct Sollya polynomial avoids the log() call (~1 ULP). * For |x| >= 0.25: compensated-addition trick: u = 1 + x, c = x - (u - 1) * captures the rounding error. Then log(u) is computed inline via the same * range reduction as log() (logarithmSep), and the result is log(u) + c/u. */ template DISPENSO_INLINE Flt log1p(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return log1p, AccuracyTraits>(SimdType_t(x)).v; } else { auto fma_fn = FloatTraits::fma; // Direct polynomial for log1p(x) on [-0.25, 0.25]: // log1p(x) = x - x²/2 + x³/3 - ... ≈ x + x² * q(x) // Sollya fpminimax((log(1+x)-x)/x^2, 6, [|SG...|], [-0.25, 0.25]): // sup-norm error < 2^-23; at x=0.25 → ~0.25 ULP polynomial error. Flt q = dispenso::fast_math::hornerEval( x, -0x1.1957b2p-3f, 0x1.3f347cp-3f, -0x1.5472d2p-3f, 0x1.98bfaap-3f, -0x1.000112p-2f, 0x1.555632p-2f, -0x1p-1f); Flt poly_r = fma_fn(q, x * x, x); // log1p(x) ≈ x + x² * q(x) if constexpr (std::is_same_v) { // Scalar: branch on magnitude. if (std::fabs(x) < 0.25f) return poly_r; } // Compensated addition: u = float(1 + x), c = rounding error. // log(1 + x) = log(u + c) = log(u) + log(1 + c/u) ≈ log(u) + c/u. Flt u = Flt(1.0f) + x; Flt c = x - (u - 1.0f); // Inline log(u): range-reduce u via logarithmSep, evaluate polynomial. auto [xi, i, m] = detail::logarithmSep(u); m = m - 1.0f; // log(1+m) polynomial for m in [sqrt(0.5)-1, sqrt(2.0)-1]. // Same coefficients as log(). Degree-9 with c0=0, c1=1 absorbed. // p(m) = m + m² * P(m). polyEval uses Estrin on CPU (~27% faster), // Horner on GPU (better for in-order pipelines). 
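// Illustration (degree-3 case, not from this header): Horner evaluates
//   fma(fma(fma(c3, m, c2), m, c1), m, c0)          -- one serial FMA chain,
// while Estrin splits the polynomial around m*m so the halves are independent:
//   m2 = m * m;  fma(m2, fma(c3, m, c2), fma(c1, m, c0))
// The two inner FMAs (and the m2 multiply) have no mutual dependencies, so an
// out-of-order CPU can overlap them, shortening the critical latency path.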
Flt inner = dispenso::fast_math::polyEval( m, 0.0924733654f, -0.14482744f, 0.148145974f, -0.165455937f, 0.199700251f, -0.250024557f, 0.333337069f, -0.499999911f); Flt y = fma_fn(inner, m * m, m); // c0=0, c1=1 absorbed y = fma_fn(i, kLn2, y); // Add compensation: log1p(x) = log(u) + c/u. Flt result = y + c / u; if constexpr (!std::is_same_v) { // SIMD: blend polynomial for small |x|, compensated log otherwise. auto small = fabs(x) < 0.25f; result = FloatTraits::conditional(small, poly_r, result); } if constexpr (AccuracyTraits::kBoundsValues) { // log1p(-1) → -inf, log1p(x < -1) → NaN, log1p(NaN) → NaN. result = detail::logarithmBounds(u, result, xi); } return result; } } /** * @brief Hyperbolic tangent approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: ~2 ULP. kBoundsValues: handles NaN. * @param x Input value (all float domain). * @return tanh(x) in [-1, 1]. Compatible with all SIMD backends. * * tanh(x) = expm1(2x) / (expm1(2x) + 2). * The range-reduced expm1 preserves precision near zero (expm1(2x) ≈ 2x), * so this formula naturally gives tanh(x) ≈ x for small x without a * separate polynomial. Input clamped to [-10, 10] since tanh(10) = 1.0f * exactly in float, and the clamp prevents expm1 overflow for large |x|. */ template DISPENSO_INLINE Flt tanh(Flt x) { assert_float_type(); if constexpr (!std::is_same_v>) { return tanh, AccuracyTraits>(SimdType_t(x)).v; } else if constexpr (std::is_same_v) { // Scalar: Sollya fpminimax polynomial for |x| < 1.0 (degree 6 in u = x²). // tanh(x) = x * (1 + u * Q(u)), where Q is fitted to tanh(x)/x - 1. // Error: ~0.52 ULP at the boundary. 6 FMAs. float ax = std::fabs(x); if (!(ax >= 1.0f)) { // !(>=) so NaN enters polynomial path and propagates constexpr float t1 = -0x1.555482p-2f; constexpr float t2 = 0x1.10ef12p-3f; constexpr float t3 = -0x1.b66cecp-5f; constexpr float t4 = 0x1.4e3a6ap-6f; constexpr float t5 = -0x1.9c954p-8f; constexpr float t6 = 0x1.189ec2p-10f; float u = x * x; float poly = std::fma(std::fma(std::fma(std::fma(std::fma(t6, u, t5), u, t4), u, t3), u, t2), u, t1); return x * std::fma(poly, u, 1.0f); } // Large |x|: use expm1 formula (no cancellation, result far from 0). float x_safe = ax < 10.0f ? x : (x < 0.0f ? -10.0f : 10.0f); float em1 = expm1(x_safe + x_safe); return em1 / (em1 + 2.0f); } else { // SIMD: pure expm1 formula (branchless, uniform across all lanes). Flt x_safe = clamp_no_nan(x, Flt(-10.0f), Flt(10.0f)); Flt em1 = expm1(x_safe + x_safe); Flt result = em1 / (em1 + 2.0f); if constexpr (AccuracyTraits::kBoundsValues) { // NaN propagation: clamp_no_nan may map NaN to a finite value. using UintT = UintType_t; auto is_nan = (bit_cast(x) & UintT(0x7fffffffu)) > UintT(0x7f800000u); result = FloatTraits::conditional(is_nan, x, result); } return result; } } /** * @brief Error function approximation. * @tparam Flt float or SIMD float type. * @tparam AccuracyTraits Default: ~2 ULP. Not used for bounds (NaN propagates naturally). * @param x Input value (all float domain). * @return erf(x) in [-1, 1]. Compatible with all SIMD backends. * * Abramowitz & Stegun 7.1.26-inspired t-substitution with Sollya-optimized * coefficients. Uses p=0.45 (vs A&S's 0.3275911) for better polynomial fit. * * Two domains: * |x| < 0.875: erf(x) = x * (c0 + x² * Q(x²)), pure polynomial. * |x| ∈ [0.875, 3.92]: erf(x) = 1 - t·P(t)·exp(-x²), t = 1/(1+px). * |x| >= 3.92: erf(x) = ±1 (saturated in float). 
*/ template DISPENSO_INLINE Flt erf(Flt x) { assert_float_type(); // AccuracyTraits accepted for API consistency; erf uses the same polynomial for all traits. (void)sizeof(AccuracyTraits); if constexpr (!std::is_same_v>) { return erf, AccuracyTraits>(SimdType_t(x)).v; } else if constexpr (std::is_same_v) { // Scalar path: branching for efficiency. // Save sign and work with |x|; restore sign at end via bit-OR. uint32_t sign = bit_cast(x) & 0x80000000u; float ax = dispenso::fast_math::fabs(x); float result; if (ax >= 3.92f) { result = 1.0f; } else if (!(ax < 0.875f)) { // !(< ) so NaN goes to erfc path and propagates // erfc formula: erf(x) = 1 - t * P(t) * exp(-x²), t = 1/(1+p*x). constexpr float p = 0.45f; float t = 1.0f / std::fma(p, ax, 1.0f); // Sollya fpminimax degree 5 P(t), float coefficients. constexpr float c0 = 0x1.04873ep-2f; constexpr float c1 = 0x1.f81fc6p-3f; constexpr float c2 = 0x1.189f42p-2f; constexpr float c3 = 0x1.15aaa6p-5f; constexpr float c4 = 0x1.65d24ep-2f; constexpr float c5 = -0x1.4432a4p-3f; float poly = t * std::fma(std::fma(std::fma(std::fma(std::fma(c5, t, c4), t, c3), t, c2), t, c1), t, c0); // Inline exp(-x²) via Cody-Waite range reduction. float u = ax * ax; constexpr float kLog2e = 0x1.715476p+0f; constexpr float kLn2hi = 0x1.62e400p-1f; constexpr float kLn2lo = 0x1.7f7d1cp-20f; float k = std::floor(u * kLog2e); float f = std::fma(k, -kLn2hi, u); f = std::fma(k, -kLn2lo, f); // Degree-5 Horner for exp(-f), f in [0, ln2). constexpr float e0 = 0x1.fffffep-1f, e1 = -0x1.ffff1ep-1f; constexpr float e2 = 0x1.ffe314p-2f, e3 = -0x1.53f876p-3f; constexpr float e4 = 0x1.462f16p-5f, e5 = -0x1.80e5b2p-8f; float exp_neg_f = std::fma(std::fma(std::fma(std::fma(std::fma(e5, f, e4), f, e3), f, e2), f, e1), f, e0); int32_t ki = convert_to_int_trunc_safe(k); float pow2_neg_k = bit_cast((127 - ki) << 23); result = 1.0f - pow2_neg_k * exp_neg_f * poly; } else { // Near-zero: erf(x) = x * (c0 + x² * Q(x²)). // Sollya fpminimax degree 5 Q(u), float coefficients. constexpr float c0 = 0x1.20dd76p+0f; // ≈ 2/√π constexpr float q0 = -0x1.812746p-2f, q1 = 0x1.ce2ec6p-4f, q2 = -0x1.b81edep-6f; constexpr float q3 = 0x1.556b48p-8f, q4 = -0x1.b0255p-11f, q5 = 0x1.7149c8p-14f; float u = ax * ax; float q = std::fma(std::fma(std::fma(std::fma(std::fma(q5, u, q4), u, q3), u, q2), u, q1), u, q0); result = ax * std::fma(q, u, c0); } return bit_cast(bit_cast(result) | sign); } else { // SIMD: branchless, compute both paths for all lanes and blend. using UintT = UintType_t; using IntT = IntType_t; auto fma_fn = FloatTraits::fma; // clamp_allow_nan preserves NaN so it propagates through the computation. Flt ax = clamp_allow_nan(fabs(x), Flt(0.0f), Flt(3.92f)); // --- Near-zero path: erf(x) = x * (c0 + x² * Q(x²)) --- Flt u_near = ax * ax; constexpr float c0_near = 0x1.20dd76p+0f; Flt q_near = hornerEval( u_near, 0x1.7149c8p-14f, -0x1.b0255p-11f, 0x1.556b48p-8f, -0x1.b81edep-6f, 0x1.ce2ec6p-4f, -0x1.812746p-2f); Flt near_result = ax * fma_fn(q_near, u_near, Flt(c0_near)); // --- erfc path: erf(x) = 1 - t * P(t) * exp(-x²) --- constexpr float p = 0.45f; Flt denom = fma_fn(Flt(p), ax, Flt(1.0f)); Flt t = Flt(1.0f) / denom; Flt erfc_poly = t * hornerEval(t, -0x1.4432a4p-3f, 0x1.65d24ep-2f, 0x1.15aaa6p-5f, 0x1.189f42p-2f, 0x1.f81fc6p-3f, 0x1.04873ep-2f); // Inline exp(-x²) via Cody-Waite range reduction. 
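// Derivation (for reference): with k = floor(x² * log2(e)) and f = x² - k*ln2,
//   exp(-x²) = exp(-(k*ln2 + f)) = 2^-k * exp(-f),   f in [0, ln2).
// The hi/lo split of ln2 keeps f accurate (k*kLn2hi is exact for the small k
// that occur here), and 2^-k is built directly in the exponent field below as
// (127 - k) << 23.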
Flt u = ax * ax; constexpr float kLog2e = 0x1.715476p+0f; constexpr float kLn2hi = 0x1.62e400p-1f; constexpr float kLn2lo = 0x1.7f7d1cp-20f; Flt k = floor_small(u * kLog2e); Flt f = fma_fn(k, Flt(-kLn2hi), u); f = fma_fn(k, Flt(-kLn2lo), f); Flt exp_neg_f = hornerEval( f, -0x1.80e5b2p-8f, 0x1.462f16p-5f, -0x1.53f876p-3f, 0x1.ffe314p-2f, -0x1.ffff1ep-1f, 0x1.fffffep-1f); IntT ki = convert_to_int(k); Flt pow2_neg_k = bit_cast(UintT((IntT(127) - ki) << 23)); Flt erfc_result = Flt(1.0f) - pow2_neg_k * exp_neg_f * erfc_poly; // Blend: near-zero for |x| < 0.875, erfc otherwise. auto small = ax < Flt(0.875f); Flt result = FloatTraits::conditional(small, near_result, erfc_result); // Restore sign from original x (odd function). Result is non-negative, so OR stamps sign. result = bit_cast(bit_cast(result) | (bit_cast(x) & UintT(0x80000000u))); return result; } } } // namespace fast_math } // namespace dispenso ================================================ FILE: dispenso/fast_math/float_traits.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #if defined(__SSE__) #include #elif defined(__aarch64__) #include #endif #include namespace dispenso { namespace fast_math { // Float-precision math constants, avoiding repeated static_cast(M_PI) etc. constexpr float kPi = static_cast(M_PI); constexpr float kPi_2 = static_cast(M_PI_2); constexpr float kPi_4 = static_cast(M_PI_4); constexpr float kLn2 = static_cast(M_LN2); constexpr float k1_Ln2 = static_cast(1.0 / M_LN2); template struct FloatTraits {}; template <> struct FloatTraits { using IntType = int32_t; using UintType = uint32_t; using BoolType = bool; static constexpr uint32_t kOne = 0x3f800000; static constexpr float kMagic = 12582912.f; // 1.5 * 2**23 static constexpr bool kBoolIsMask = false; static DISPENSO_INLINE float sqrt(float x) { return std::sqrt(x); } static DISPENSO_INLINE float rcp(float x) { #if defined(__SSE__) return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x))); #elif defined(__aarch64__) return vrecpes_f32(x); #else return 1.0f / x; #endif } template static DISPENSO_INLINE Arg conditional(bool b, Arg x, Arg y) { return b ? x : y; } template static DISPENSO_INLINE Arg apply(bool b, Arg x) { // For kBoolIsMask cases, this is b & x return b * x; } static DISPENSO_INLINE float min(float a, float b) { return std::min(a, b); } static DISPENSO_INLINE float max(float a, float b) { return std::max(a, b); } static DISPENSO_INLINE float fma(float a, float b, float c) { return std::fma(a, b, c); } }; template <> struct FloatTraits { using IntType = int32_t; }; template <> struct FloatTraits { using IntType = uint32_t; }; // Non-deduced context helper: prevents template argument deduction on a parameter. // Use as function parameter type to force callers to rely on deduction from other args. template struct NonDeducedHelper { using type = T; }; template using NonDeduced = typename NonDeducedHelper::type; // Maps raw SIMD intrinsic types to their wrapper types for template deduction. // Default (identity): scalar types and wrapper types map to themselves. // Specializations in backend headers map __m128 → SseFloat, __m256 → AvxFloat, etc. // This enables fm::sin(__m128_val) to work via automatic forwarding. 
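// Usage sketch (illustrative; assumes the AVX backend header is in use so
// SimdTypeFor<__m256> resolves to AvxFloat, and that the math entry points
// follow the forwarding pattern shown for pow/expm1/tanh above):
//   __m256 v = _mm256_set1_ps(0.5f);
//   __m256 t = dispenso::fast_math::tanh(v);  // Flt deduces as __m256; the call
//                                             // re-dispatches on AvxFloat and
//                                             // returns the raw .v vector.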
template struct SimdTypeFor { using type = T; }; template using SimdType_t = typename SimdTypeFor::type; template using IntType_t = typename FloatTraits>::IntType; template using UintType_t = typename FloatTraits>::UintType; template using BoolType_t = typename FloatTraits>::BoolType; } // namespace fast_math } // namespace dispenso ================================================ FILE: dispenso/fast_math/float_traits_avx.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #if defined(__AVX2__) #include #include #include "float_traits.h" #include "util.h" namespace dispenso { namespace fast_math { struct AvxInt32; struct AvxUint32; // 8-wide float SIMD wrapper around __m256. struct AvxFloat { __m256 v; AvxFloat() = default; AvxFloat(__m256 vec) : v(vec) {} // Implicit broadcast from scalar — required for polynomial constant propagation. AvxFloat(float f) : v(_mm256_set1_ps(f)) {} // Explicit int→float conversion for (Flt)intExpr patterns. explicit AvxFloat(AvxInt32 i); // Implicit conversion back to raw intrinsic type. operator __m256() const { return v; } AvxFloat operator-() const { return _mm256_xor_ps(v, _mm256_set1_ps(-0.0f)); } AvxFloat& operator+=(AvxFloat o) { v = _mm256_add_ps(v, o.v); return *this; } AvxFloat& operator-=(AvxFloat o) { v = _mm256_sub_ps(v, o.v); return *this; } AvxFloat& operator*=(AvxFloat o) { v = _mm256_mul_ps(v, o.v); return *this; } AvxFloat& operator&=(AvxFloat o) { v = _mm256_and_ps(v, o.v); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend AvxFloat operator+(AvxFloat a, AvxFloat b) { return _mm256_add_ps(a.v, b.v); } friend AvxFloat operator-(AvxFloat a, AvxFloat b) { return _mm256_sub_ps(a.v, b.v); } friend AvxFloat operator*(AvxFloat a, AvxFloat b) { return _mm256_mul_ps(a.v, b.v); } friend AvxFloat operator/(AvxFloat a, AvxFloat b) { return _mm256_div_ps(a.v, b.v); } // Comparisons return AvxFloat masks (all-ones or all-zeros per lane). friend AvxFloat operator<(AvxFloat a, AvxFloat b) { return _mm256_cmp_ps(a.v, b.v, _CMP_LT_OQ); } friend AvxFloat operator>(AvxFloat a, AvxFloat b) { return _mm256_cmp_ps(a.v, b.v, _CMP_GT_OQ); } friend AvxFloat operator<=(AvxFloat a, AvxFloat b) { return _mm256_cmp_ps(a.v, b.v, _CMP_LE_OQ); } friend AvxFloat operator>=(AvxFloat a, AvxFloat b) { return _mm256_cmp_ps(a.v, b.v, _CMP_GE_OQ); } friend AvxFloat operator==(AvxFloat a, AvxFloat b) { return _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); } friend AvxFloat operator!=(AvxFloat a, AvxFloat b) { return _mm256_cmp_ps(a.v, b.v, _CMP_NEQ_UQ); } // Logical NOT of a comparison mask. friend AvxFloat operator!(AvxFloat a) { return _mm256_xor_ps(a.v, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } // Bitwise ops on float masks. friend AvxFloat operator&(AvxFloat a, AvxFloat b) { return _mm256_and_ps(a.v, b.v); } friend AvxFloat operator|(AvxFloat a, AvxFloat b) { return _mm256_or_ps(a.v, b.v); } friend AvxFloat operator^(AvxFloat a, AvxFloat b) { return _mm256_xor_ps(a.v, b.v); } }; // 8-wide int32 SIMD wrapper around __m256i. struct AvxInt32 { __m256i v; AvxInt32() = default; AvxInt32(__m256i vec) : v(vec) {} AvxInt32(int32_t i) : v(_mm256_set1_epi32(i)) {} // Reinterpret from AvxUint32 (no-op on __m256i). AvxInt32(AvxUint32 u); // Implicit conversion back to raw intrinsic type. 
operator __m256i() const { return v; } AvxInt32 operator-() const { return _mm256_sub_epi32(_mm256_setzero_si256(), v); } AvxInt32& operator+=(AvxInt32 o) { v = _mm256_add_epi32(v, o.v); return *this; } AvxInt32& operator-=(AvxInt32 o) { v = _mm256_sub_epi32(v, o.v); return *this; } AvxInt32& operator*=(AvxInt32 o) { v = _mm256_mullo_epi32(v, o.v); return *this; } AvxInt32& operator&=(AvxInt32 o) { v = _mm256_and_si256(v, o.v); return *this; } AvxInt32& operator|=(AvxInt32 o) { v = _mm256_or_si256(v, o.v); return *this; } // Shifts. AvxInt32 operator<<(int n) const { return _mm256_slli_epi32(v, n); } AvxInt32 operator>>(int n) const { return _mm256_srai_epi32(v, n); // Arithmetic shift right } AvxInt32& operator<<=(int n) { v = _mm256_slli_epi32(v, n); return *this; } AvxInt32& operator>>=(int n) { v = _mm256_srai_epi32(v, n); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend AvxInt32 operator+(AvxInt32 a, AvxInt32 b) { return _mm256_add_epi32(a.v, b.v); } friend AvxInt32 operator-(AvxInt32 a, AvxInt32 b) { return _mm256_sub_epi32(a.v, b.v); } friend AvxInt32 operator*(AvxInt32 a, AvxInt32 b) { return _mm256_mullo_epi32(a.v, b.v); } // Bitwise. friend AvxInt32 operator&(AvxInt32 a, AvxInt32 b) { return _mm256_and_si256(a.v, b.v); } friend AvxInt32 operator|(AvxInt32 a, AvxInt32 b) { return _mm256_or_si256(a.v, b.v); } friend AvxInt32 operator^(AvxInt32 a, AvxInt32 b) { return _mm256_xor_si256(a.v, b.v); } friend AvxInt32 operator~(AvxInt32 a) { return _mm256_xor_si256(a.v, _mm256_set1_epi32(-1)); } // Comparisons return AvxInt32 masks. friend AvxInt32 operator==(AvxInt32 a, AvxInt32 b) { return _mm256_cmpeq_epi32(a.v, b.v); } friend AvxInt32 operator!=(AvxInt32 a, AvxInt32 b) { return _mm256_xor_si256(_mm256_cmpeq_epi32(a.v, b.v), _mm256_set1_epi32(-1)); } friend AvxInt32 operator<(AvxInt32 a, AvxInt32 b) { return _mm256_cmpgt_epi32(b.v, a.v); // No _mm256_cmplt_epi32; use swapped cmpgt. } friend AvxInt32 operator>(AvxInt32 a, AvxInt32 b) { return _mm256_cmpgt_epi32(a.v, b.v); } friend AvxInt32 operator!(AvxInt32 a) { return _mm256_cmpeq_epi32(a.v, _mm256_setzero_si256()); } }; // 8-wide uint32 SIMD wrapper around __m256i. struct AvxUint32 { __m256i v; AvxUint32() = default; AvxUint32(__m256i vec) : v(vec) {} AvxUint32(uint32_t u) : v(_mm256_set1_epi32(static_cast(u))) {} // Reinterpret from AvxInt32 (no-op on __m256i). AvxUint32(AvxInt32 i) : v(i.v) {} // Implicit conversion back to raw intrinsic type. operator __m256i() const { return v; } AvxUint32& operator+=(AvxUint32 o) { v = _mm256_add_epi32(v, o.v); return *this; } AvxUint32& operator-=(AvxUint32 o) { v = _mm256_sub_epi32(v, o.v); return *this; } AvxUint32& operator*=(AvxUint32 o) { v = _mm256_mullo_epi32(v, o.v); return *this; } AvxUint32& operator&=(AvxUint32 o) { v = _mm256_and_si256(v, o.v); return *this; } AvxUint32& operator|=(AvxUint32 o) { v = _mm256_or_si256(v, o.v); return *this; } // Shifts. AvxUint32 operator<<(int n) const { return _mm256_slli_epi32(v, n); } AvxUint32 operator>>(int n) const { return _mm256_srli_epi32(v, n); // Logical shift right } AvxUint32& operator<<=(int n) { v = _mm256_slli_epi32(v, n); return *this; } AvxUint32& operator>>=(int n) { v = _mm256_srli_epi32(v, n); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). 
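// (Declaring the operators as non-member friends lets either operand convert
// implicitly, so e.g. both `u + 5u` and `5u + u` resolve through the
// AvxUint32(uint32_t) broadcast constructor.)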
friend AvxUint32 operator+(AvxUint32 a, AvxUint32 b) { return _mm256_add_epi32(a.v, b.v); } friend AvxUint32 operator-(AvxUint32 a, AvxUint32 b) { return _mm256_sub_epi32(a.v, b.v); } friend AvxUint32 operator*(AvxUint32 a, AvxUint32 b) { return _mm256_mullo_epi32(a.v, b.v); } // Bitwise. friend AvxUint32 operator&(AvxUint32 a, AvxUint32 b) { return _mm256_and_si256(a.v, b.v); } friend AvxUint32 operator|(AvxUint32 a, AvxUint32 b) { return _mm256_or_si256(a.v, b.v); } friend AvxUint32 operator^(AvxUint32 a, AvxUint32 b) { return _mm256_xor_si256(a.v, b.v); } friend AvxUint32 operator~(AvxUint32 a) { return _mm256_xor_si256(a.v, _mm256_set1_epi32(-1)); } // Unsigned comparisons need XOR with sign bit to convert to signed domain. friend AvxUint32 operator==(AvxUint32 a, AvxUint32 b) { return _mm256_cmpeq_epi32(a.v, b.v); } friend AvxUint32 operator!=(AvxUint32 a, AvxUint32 b) { return _mm256_xor_si256(_mm256_cmpeq_epi32(a.v, b.v), _mm256_set1_epi32(-1)); } friend AvxUint32 operator>(AvxUint32 a, AvxUint32 b) { __m256i bias = _mm256_set1_epi32(static_cast(0x80000000u)); return _mm256_cmpgt_epi32(_mm256_xor_si256(a.v, bias), _mm256_xor_si256(b.v, bias)); } friend AvxUint32 operator<(AvxUint32 a, AvxUint32 b) { return b > a; } friend AvxUint32 operator!(AvxUint32 a) { return _mm256_cmpeq_epi32(a.v, _mm256_setzero_si256()); } }; // AvxFloat ↔ AvxInt32 conversion. inline AvxFloat::AvxFloat(AvxInt32 i) : v(_mm256_cvtepi32_ps(i.v)) {} // AvxInt32 ↔ AvxUint32 reinterpret (no-op on __m256i). inline AvxInt32::AvxInt32(AvxUint32 u) : v(u.v) {} // Map raw __m256 to AvxFloat for SimdTypeFor. template <> struct SimdTypeFor<__m256> { using type = AvxFloat; }; // --- bit_cast specializations --- template <> inline AvxInt32 bit_cast(const AvxFloat& f) noexcept { return _mm256_castps_si256(f.v); } template <> inline AvxUint32 bit_cast(const AvxFloat& f) noexcept { return _mm256_castps_si256(f.v); } template <> inline AvxFloat bit_cast(const AvxInt32& i) noexcept { return _mm256_castsi256_ps(i.v); } template <> inline AvxFloat bit_cast(const AvxUint32& u) noexcept { return _mm256_castsi256_ps(u.v); } template <> inline AvxInt32 bit_cast(const AvxUint32& u) noexcept { return u.v; } template <> inline AvxUint32 bit_cast(const AvxInt32& i) noexcept { return i.v; } // --- FloatTraits --- template <> struct FloatTraits { using IntType = AvxInt32; using UintType = AvxUint32; using BoolType = AvxFloat; // Float comparison masks static constexpr uint32_t kOne = 0x3f800000; static constexpr float kMagic = 12582912.f; static constexpr bool kBoolIsMask = true; static DISPENSO_INLINE AvxFloat sqrt(AvxFloat x) { return _mm256_sqrt_ps(x.v); } static DISPENSO_INLINE AvxFloat rcp(AvxFloat x) { return _mm256_rcp_ps(x.v); } static DISPENSO_INLINE AvxFloat fma(AvxFloat a, AvxFloat b, AvxFloat c) { #if defined(__FMA__) return _mm256_fmadd_ps(a.v, b.v, c.v); #else return _mm256_add_ps(_mm256_mul_ps(a.v, b.v), c.v); #endif } // conditional: select x where mask is true, y where false. template static DISPENSO_INLINE Arg conditional(AvxFloat mask, Arg x, Arg y); template static DISPENSO_INLINE Arg conditional(AvxInt32 mask, Arg x, Arg y); template static DISPENSO_INLINE Arg apply(AvxFloat mask, Arg x); static DISPENSO_INLINE AvxFloat min(AvxFloat a, AvxFloat b) { return _mm256_min_ps(a.v, b.v); } static DISPENSO_INLINE AvxFloat max(AvxFloat a, AvxFloat b) { return _mm256_max_ps(a.v, b.v); } }; // conditional specializations. 
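// Usage sketch (illustrative, not part of the library): branchless per-lane
// select driven by a comparison mask, e.g. absolute value:
//   AvxFloat mask = (x < AvxFloat(0.0f));       // all-ones lanes where x < 0
//   AvxFloat ax   = FloatTraits<AvxFloat>::conditional(mask, -x, x);
// Lanes where the mask is all-ones take the first value, the rest the second.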
template <> inline AvxFloat FloatTraits::conditional(AvxFloat mask, AvxFloat x, AvxFloat y) { return _mm256_blendv_ps(y.v, x.v, mask.v); } template <> inline AvxInt32 FloatTraits::conditional(AvxFloat mask, AvxInt32 x, AvxInt32 y) { return _mm256_castps_si256( _mm256_blendv_ps(_mm256_castsi256_ps(y.v), _mm256_castsi256_ps(x.v), mask.v)); } template <> inline AvxUint32 FloatTraits::conditional(AvxFloat mask, AvxUint32 x, AvxUint32 y) { return _mm256_castps_si256( _mm256_blendv_ps(_mm256_castsi256_ps(y.v), _mm256_castsi256_ps(x.v), mask.v)); } template <> inline AvxFloat FloatTraits::conditional(AvxInt32 mask, AvxFloat x, AvxFloat y) { return _mm256_blendv_ps(y.v, x.v, _mm256_castsi256_ps(mask.v)); } template <> inline AvxInt32 FloatTraits::conditional(AvxInt32 mask, AvxInt32 x, AvxInt32 y) { return _mm256_castps_si256(_mm256_blendv_ps( _mm256_castsi256_ps(y.v), _mm256_castsi256_ps(x.v), _mm256_castsi256_ps(mask.v))); } template <> inline AvxUint32 FloatTraits::conditional(AvxInt32 mask, AvxUint32 x, AvxUint32 y) { return _mm256_castps_si256(_mm256_blendv_ps( _mm256_castsi256_ps(y.v), _mm256_castsi256_ps(x.v), _mm256_castsi256_ps(mask.v))); } // apply: mask & x (bitwise AND). template <> inline AvxFloat FloatTraits::apply(AvxFloat mask, AvxFloat x) { return _mm256_and_ps(mask.v, x.v); } template <> inline AvxInt32 FloatTraits::apply(AvxFloat mask, AvxInt32 x) { return _mm256_and_si256(_mm256_castps_si256(mask.v), x.v); } template <> inline AvxUint32 FloatTraits::apply(AvxFloat mask, AvxUint32 x) { return _mm256_and_si256(_mm256_castps_si256(mask.v), x.v); } template <> struct FloatTraits { using IntType = AvxInt32; }; template <> struct FloatTraits { using IntType = AvxUint32; }; // --- Util function overloads for AVX types --- DISPENSO_INLINE AvxFloat floor_small(AvxFloat x) { return _mm256_floor_ps(x.v); } DISPENSO_INLINE AvxInt32 convert_to_int_trunc(AvxFloat f) { return _mm256_cvttps_epi32(f.v); } DISPENSO_INLINE AvxInt32 convert_to_int_trunc_safe(AvxFloat f) { AvxInt32 fi = bit_cast(f); AvxInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & AvxInt32(_mm256_cvttps_epi32(f.v)); } DISPENSO_INLINE AvxInt32 convert_to_int(AvxFloat f) { // _mm256_cvtps_epi32 uses round-to-nearest-even. // Mask non-normals to 0 to avoid undefined behavior. AvxInt32 fi = bit_cast(f); AvxInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & AvxInt32(_mm256_cvtps_epi32(f.v)); } template <> DISPENSO_INLINE AvxFloat min(AvxFloat x, AvxFloat mn) { // Ordering: if x is NaN, result is mn (second operand). return _mm256_min_ps(x.v, mn.v); } template <> DISPENSO_INLINE AvxFloat clamp_allow_nan(AvxFloat x, AvxFloat mn, AvxFloat mx) { return _mm256_max_ps(mn.v, _mm256_min_ps(mx.v, x.v)); } template <> DISPENSO_INLINE AvxFloat clamp_no_nan(AvxFloat x, AvxFloat mn, AvxFloat mx) { return _mm256_max_ps(mn.v, _mm256_min_ps(x.v, mx.v)); } template <> DISPENSO_INLINE AvxFloat gather(const float* table, AvxInt32 index) { return _mm256_i32gather_ps(table, index.v, 4); } DISPENSO_INLINE AvxInt32 int_div_by_3(AvxInt32 i) { // Multiply each lane by 0x55555556 and take the high 32 bits. // Process even and odd lanes separately since _mm256_mul_epu32 only uses lanes 0,2,4,6. __m256i multiplier = _mm256_set1_epi32(0x55555556); // Even lanes (0, 2, 4, 6). __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(i.v, multiplier), 32); // Odd lanes (1, 3, 5, 7): shift right by 32 bits to put odd lanes in even positions. 
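// (Why the constant works: 0x55555556 = ceil(2^32 / 3), so for any lane value
// i in [0, 2^31) the high 32 bits of i * 0x55555556 equal i / 3; the residual
// 2*i / (3 * 2^32) is too small to carry the quotient across the next integer
// boundary.)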
__m256i i_odd = _mm256_srli_epi64(i.v, 32); __m256i odd = _mm256_srli_epi64(_mm256_mul_epu32(i_odd, multiplier), 32); odd = _mm256_slli_epi64(odd, 32); // Blend at 32-bit granularity: even indices from even, odd from odd. return _mm256_blend_epi32(even, odd, 0xAA); // 0xAA = 10101010b } // nonnormal/nonnormalOrZero: return AvxInt32 masks for AVX types. DISPENSO_INLINE AvxInt32 nonnormal(AvxInt32 i) { return (i & 0x7f800000) == 0x7f800000; } DISPENSO_INLINE AvxInt32 nonnormalOrZero(AvxInt32 i) { auto m = i & 0x7f800000; return (m == 0x7f800000) | (m == 0); } DISPENSO_INLINE AvxInt32 nonnormal(AvxFloat f) { return nonnormal(bit_cast(f)); } // any_true: reduce SIMD mask to scalar bool (true if any lane is set). DISPENSO_INLINE bool any_true(AvxInt32 mask) { return _mm256_movemask_ps(_mm256_castsi256_ps(mask.v)) != 0; } DISPENSO_INLINE AvxFloat signof(AvxFloat x) { AvxUint32 xi = bit_cast(x); return bit_cast((xi & 0x80000000u) | FloatTraits::kOne); } DISPENSO_INLINE AvxInt32 signofi(AvxInt32 i) { return AvxInt32(1) - (AvxInt32(2) & (i < AvxInt32(0))); } // nbool_as_one: 0 if mask is true (all-ones), 1 if false (all-zeros). template <> DISPENSO_INLINE AvxFloat nbool_as_one(AvxFloat b) { // ~mask & 1.0f bits return bit_cast( AvxInt32(_mm256_andnot_si256(_mm256_castps_si256(b.v), _mm256_set1_epi32(0x3f800000)))); } template <> DISPENSO_INLINE AvxInt32 nbool_as_one(AvxFloat b) { return _mm256_andnot_si256(_mm256_castps_si256(b.v), _mm256_set1_epi32(1)); } template <> DISPENSO_INLINE AvxInt32 nbool_as_one(AvxInt32 b) { return _mm256_andnot_si256(b.v, _mm256_set1_epi32(1)); } // bool_as_one: 1 if mask is true, 0 if false. template <> DISPENSO_INLINE AvxFloat bool_as_one(AvxFloat b) { return bit_cast( AvxInt32(_mm256_and_si256(_mm256_castps_si256(b.v), _mm256_set1_epi32(0x3f800000)))); } template <> DISPENSO_INLINE AvxInt32 bool_as_one(AvxFloat b) { return _mm256_and_si256(_mm256_castps_si256(b.v), _mm256_set1_epi32(1)); } // bool_as_mask: for SIMD masks, identity (already a mask). template <> DISPENSO_INLINE AvxInt32 bool_as_mask(AvxFloat b) { return _mm256_castps_si256(b.v); } template <> DISPENSO_INLINE AvxInt32 bool_as_mask(AvxInt32 b) { return b; } template <> DISPENSO_INLINE AvxInt32 bool_as_mask(AvxUint32 b) { return b.v; } template <> DISPENSO_INLINE AvxUint32 bool_as_mask(AvxFloat b) { return _mm256_castps_si256(b.v); } template <> DISPENSO_INLINE AvxUint32 bool_as_mask(AvxUint32 b) { return b; } template <> DISPENSO_INLINE AvxUint32 bool_as_mask(AvxInt32 b) { return b.v; } } // namespace fast_math } // namespace dispenso #endif // defined(__AVX2__) ================================================ FILE: dispenso/fast_math/float_traits_avx512.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #if defined(__AVX512F__) #include #include #include "float_traits.h" #include "util.h" namespace dispenso { namespace fast_math { struct Avx512Int32; struct Avx512Uint32; // 16-bit mask wrapping __mmask16 for AVX-512 predicated operations. // Supports comparison with int literals (mask == 0 → all-false check), // logical ops, and implicit conversion to Avx512Int32 for template code // that assigns BoolType to IntType. struct Avx512Mask { __mmask16 m; Avx512Mask() = default; Avx512Mask(__mmask16 mask) : m(mask) {} // Construct from int literal: 0 → all-false, non-zero → all-true. 
// Used by template code like `(expr) == 0` where expr is BoolType. Avx512Mask(int val) : m(val ? 0xFFFF : 0) {} // Logical NOT. friend Avx512Mask operator!(Avx512Mask a) { return static_cast<__mmask16>(~a.m); } // Logical AND/OR/XOR. friend Avx512Mask operator&(Avx512Mask a, Avx512Mask b) { return static_cast<__mmask16>(a.m & b.m); } friend Avx512Mask operator|(Avx512Mask a, Avx512Mask b) { return static_cast<__mmask16>(a.m | b.m); } friend Avx512Mask operator^(Avx512Mask a, Avx512Mask b) { return static_cast<__mmask16>(a.m ^ b.m); } Avx512Mask& operator&=(Avx512Mask o) { m &= o.m; return *this; } Avx512Mask& operator|=(Avx512Mask o) { m |= o.m; return *this; } // Equality: per-bit XNOR. mask == Avx512Mask(0) is equivalent to !mask. friend Avx512Mask operator==(Avx512Mask a, Avx512Mask b) { return static_cast<__mmask16>(~(a.m ^ b.m)); } friend Avx512Mask operator!=(Avx512Mask a, Avx512Mask b) { return static_cast<__mmask16>(a.m ^ b.m); } // Implicit conversion to Avx512Int32 (lane-wide mask: all-ones or all-zeros). // Used by template code that assigns BoolType to IntType_t. inline operator Avx512Int32() const; }; // 16-wide float SIMD wrapper around __m512. struct Avx512Float { __m512 v; Avx512Float() = default; Avx512Float(__m512 vec) : v(vec) {} // Implicit broadcast from scalar — required for polynomial constant propagation. Avx512Float(float f) : v(_mm512_set1_ps(f)) {} // Explicit int→float conversion for (Flt)intExpr patterns. explicit Avx512Float(Avx512Int32 i); // Implicit conversion back to raw intrinsic type. operator __m512() const { return v; } Avx512Float operator-() const { // _mm512_xor_ps requires AVX-512 DQ; use integer XOR on sign bit instead. return _mm512_castsi512_ps( _mm512_xor_si512(_mm512_castps_si512(v), _mm512_set1_epi32(int32_t(0x80000000)))); } Avx512Float& operator+=(Avx512Float o) { v = _mm512_add_ps(v, o.v); return *this; } Avx512Float& operator-=(Avx512Float o) { v = _mm512_sub_ps(v, o.v); return *this; } Avx512Float& operator*=(Avx512Float o) { v = _mm512_mul_ps(v, o.v); return *this; } Avx512Float& operator&=(Avx512Float o) { // _mm512_and_ps requires AVX-512 DQ; use integer AND instead. v = _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(v), _mm512_castps_si512(o.v))); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend Avx512Float operator+(Avx512Float a, Avx512Float b) { return _mm512_add_ps(a.v, b.v); } friend Avx512Float operator-(Avx512Float a, Avx512Float b) { return _mm512_sub_ps(a.v, b.v); } friend Avx512Float operator*(Avx512Float a, Avx512Float b) { return _mm512_mul_ps(a.v, b.v); } friend Avx512Float operator/(Avx512Float a, Avx512Float b) { return _mm512_div_ps(a.v, b.v); } // Comparisons return Avx512Mask (native __mmask16). friend Avx512Mask operator<(Avx512Float a, Avx512Float b) { return _mm512_cmp_ps_mask(a.v, b.v, _CMP_LT_OQ); } friend Avx512Mask operator>(Avx512Float a, Avx512Float b) { return _mm512_cmp_ps_mask(a.v, b.v, _CMP_GT_OQ); } friend Avx512Mask operator<=(Avx512Float a, Avx512Float b) { return _mm512_cmp_ps_mask(a.v, b.v, _CMP_LE_OQ); } friend Avx512Mask operator>=(Avx512Float a, Avx512Float b) { return _mm512_cmp_ps_mask(a.v, b.v, _CMP_GE_OQ); } friend Avx512Mask operator==(Avx512Float a, Avx512Float b) { return _mm512_cmp_ps_mask(a.v, b.v, _CMP_EQ_OQ); } friend Avx512Mask operator!=(Avx512Float a, Avx512Float b) { return _mm512_cmp_ps_mask(a.v, b.v, _CMP_NEQ_UQ); } // Bitwise ops on float values (for sign manipulation, etc.). 
// _mm512_{and,or,xor}_ps require AVX-512 DQ; use integer ops + casts. friend Avx512Float operator&(Avx512Float a, Avx512Float b) { return _mm512_castsi512_ps( _mm512_and_si512(_mm512_castps_si512(a.v), _mm512_castps_si512(b.v))); } friend Avx512Float operator|(Avx512Float a, Avx512Float b) { return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(a.v), _mm512_castps_si512(b.v))); } friend Avx512Float operator^(Avx512Float a, Avx512Float b) { return _mm512_castsi512_ps( _mm512_xor_si512(_mm512_castps_si512(a.v), _mm512_castps_si512(b.v))); } }; // 16-wide int32 SIMD wrapper around __m512i. struct Avx512Int32 { __m512i v; Avx512Int32() = default; Avx512Int32(__m512i vec) : v(vec) {} Avx512Int32(int32_t i) : v(_mm512_set1_epi32(i)) {} // Reinterpret from Avx512Uint32 (no-op on __m512i). Avx512Int32(Avx512Uint32 u); // Implicit conversion back to raw intrinsic type. operator __m512i() const { return v; } Avx512Int32 operator-() const { return _mm512_sub_epi32(_mm512_setzero_si512(), v); } Avx512Int32& operator+=(Avx512Int32 o) { v = _mm512_add_epi32(v, o.v); return *this; } Avx512Int32& operator-=(Avx512Int32 o) { v = _mm512_sub_epi32(v, o.v); return *this; } Avx512Int32& operator*=(Avx512Int32 o) { v = _mm512_mullo_epi32(v, o.v); return *this; } Avx512Int32& operator&=(Avx512Int32 o) { v = _mm512_and_si512(v, o.v); return *this; } Avx512Int32& operator|=(Avx512Int32 o) { v = _mm512_or_si512(v, o.v); return *this; } // Shifts. Avx512Int32 operator<<(int n) const { return _mm512_slli_epi32(v, n); } Avx512Int32 operator>>(int n) const { return _mm512_srai_epi32(v, n); // Arithmetic shift right } Avx512Int32& operator<<=(int n) { v = _mm512_slli_epi32(v, n); return *this; } Avx512Int32& operator>>=(int n) { v = _mm512_srai_epi32(v, n); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend Avx512Int32 operator+(Avx512Int32 a, Avx512Int32 b) { return _mm512_add_epi32(a.v, b.v); } friend Avx512Int32 operator-(Avx512Int32 a, Avx512Int32 b) { return _mm512_sub_epi32(a.v, b.v); } friend Avx512Int32 operator*(Avx512Int32 a, Avx512Int32 b) { return _mm512_mullo_epi32(a.v, b.v); } // Bitwise. friend Avx512Int32 operator&(Avx512Int32 a, Avx512Int32 b) { return _mm512_and_si512(a.v, b.v); } friend Avx512Int32 operator|(Avx512Int32 a, Avx512Int32 b) { return _mm512_or_si512(a.v, b.v); } friend Avx512Int32 operator^(Avx512Int32 a, Avx512Int32 b) { return _mm512_xor_si512(a.v, b.v); } friend Avx512Int32 operator~(Avx512Int32 a) { return _mm512_xor_si512(a.v, _mm512_set1_epi32(-1)); } // Comparisons return Avx512Mask (native __mmask16). friend Avx512Mask operator==(Avx512Int32 a, Avx512Int32 b) { return _mm512_cmpeq_epi32_mask(a.v, b.v); } friend Avx512Mask operator!=(Avx512Int32 a, Avx512Int32 b) { return static_cast<__mmask16>(~_mm512_cmpeq_epi32_mask(a.v, b.v)); } friend Avx512Mask operator<(Avx512Int32 a, Avx512Int32 b) { return _mm512_cmplt_epi32_mask(a.v, b.v); } friend Avx512Mask operator>(Avx512Int32 a, Avx512Int32 b) { return _mm512_cmpgt_epi32_mask(a.v, b.v); } friend Avx512Mask operator!(Avx512Int32 a) { return _mm512_cmpeq_epi32_mask(a.v, _mm512_setzero_si512()); } }; // 16-wide uint32 SIMD wrapper around __m512i. struct Avx512Uint32 { __m512i v; Avx512Uint32() = default; Avx512Uint32(__m512i vec) : v(vec) {} Avx512Uint32(uint32_t u) : v(_mm512_set1_epi32(static_cast(u))) {} // Reinterpret from Avx512Int32 (no-op on __m512i). Avx512Uint32(Avx512Int32 i) : v(i.v) {} // Implicit conversion back to raw intrinsic type. 
operator __m512i() const { return v; } Avx512Uint32& operator+=(Avx512Uint32 o) { v = _mm512_add_epi32(v, o.v); return *this; } Avx512Uint32& operator-=(Avx512Uint32 o) { v = _mm512_sub_epi32(v, o.v); return *this; } Avx512Uint32& operator*=(Avx512Uint32 o) { v = _mm512_mullo_epi32(v, o.v); return *this; } Avx512Uint32& operator&=(Avx512Uint32 o) { v = _mm512_and_si512(v, o.v); return *this; } Avx512Uint32& operator|=(Avx512Uint32 o) { v = _mm512_or_si512(v, o.v); return *this; } // Shifts. Avx512Uint32 operator<<(int n) const { return _mm512_slli_epi32(v, n); } Avx512Uint32 operator>>(int n) const { return _mm512_srli_epi32(v, n); // Logical shift right } Avx512Uint32& operator<<=(int n) { v = _mm512_slli_epi32(v, n); return *this; } Avx512Uint32& operator>>=(int n) { v = _mm512_srli_epi32(v, n); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend Avx512Uint32 operator+(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_add_epi32(a.v, b.v); } friend Avx512Uint32 operator-(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_sub_epi32(a.v, b.v); } friend Avx512Uint32 operator*(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_mullo_epi32(a.v, b.v); } // Bitwise. friend Avx512Uint32 operator&(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_and_si512(a.v, b.v); } friend Avx512Uint32 operator|(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_or_si512(a.v, b.v); } friend Avx512Uint32 operator^(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_xor_si512(a.v, b.v); } friend Avx512Uint32 operator~(Avx512Uint32 a) { return _mm512_xor_si512(a.v, _mm512_set1_epi32(-1)); } // AVX-512 has native unsigned comparisons. friend Avx512Mask operator==(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_cmpeq_epi32_mask(a.v, b.v); } friend Avx512Mask operator!=(Avx512Uint32 a, Avx512Uint32 b) { return static_cast<__mmask16>(~_mm512_cmpeq_epi32_mask(a.v, b.v)); } friend Avx512Mask operator>(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_cmpgt_epu32_mask(a.v, b.v); } friend Avx512Mask operator<(Avx512Uint32 a, Avx512Uint32 b) { return _mm512_cmplt_epu32_mask(a.v, b.v); } friend Avx512Mask operator!(Avx512Uint32 a) { return _mm512_cmpeq_epi32_mask(a.v, _mm512_setzero_si512()); } }; // --- Deferred inline definitions --- // Avx512Mask → Avx512Int32: expand to lane-wide mask (all-ones or all-zeros). inline Avx512Mask::operator Avx512Int32() const { return _mm512_maskz_set1_epi32(m, -1); } // Avx512Float ↔ Avx512Int32 conversion. inline Avx512Float::Avx512Float(Avx512Int32 i) : v(_mm512_cvtepi32_ps(i.v)) {} // Avx512Int32 ↔ Avx512Uint32 reinterpret (no-op on __m512i). inline Avx512Int32::Avx512Int32(Avx512Uint32 u) : v(u.v) {} // Map raw __m512 to Avx512Float for SimdTypeFor. 
template <> struct SimdTypeFor<__m512> { using type = Avx512Float; }; // --- bit_cast specializations --- template <> inline Avx512Int32 bit_cast(const Avx512Float& f) noexcept { return _mm512_castps_si512(f.v); } template <> inline Avx512Uint32 bit_cast(const Avx512Float& f) noexcept { return _mm512_castps_si512(f.v); } template <> inline Avx512Float bit_cast(const Avx512Int32& i) noexcept { return _mm512_castsi512_ps(i.v); } template <> inline Avx512Float bit_cast(const Avx512Uint32& u) noexcept { return _mm512_castsi512_ps(u.v); } template <> inline Avx512Int32 bit_cast(const Avx512Uint32& u) noexcept { return u.v; } template <> inline Avx512Uint32 bit_cast(const Avx512Int32& i) noexcept { return i.v; } // --- FloatTraits --- template <> struct FloatTraits { using IntType = Avx512Int32; using UintType = Avx512Uint32; using BoolType = Avx512Mask; static constexpr uint32_t kOne = 0x3f800000; static constexpr float kMagic = 12582912.f; static constexpr bool kBoolIsMask = true; static DISPENSO_INLINE Avx512Float sqrt(Avx512Float x) { return _mm512_sqrt_ps(x.v); } static DISPENSO_INLINE Avx512Float rcp(Avx512Float x) { return _mm512_rcp14_ps(x.v); } // AVX-512 always has FMA. static DISPENSO_INLINE Avx512Float fma(Avx512Float a, Avx512Float b, Avx512Float c) { return _mm512_fmadd_ps(a.v, b.v, c.v); } // conditional: select x where mask is true, y where false. // Native __mmask16 versions — the fast path. template static DISPENSO_INLINE Arg conditional(Avx512Mask mask, Arg x, Arg y); // Lane-wide int32 mask versions — for compatibility with code using IntType masks. template static DISPENSO_INLINE Arg conditional(Avx512Int32 mask, Arg x, Arg y); // apply: zero out lanes where mask is false. template static DISPENSO_INLINE Arg apply(Avx512Mask mask, Arg x); static DISPENSO_INLINE Avx512Float min(Avx512Float a, Avx512Float b) { return _mm512_min_ps(a.v, b.v); } static DISPENSO_INLINE Avx512Float max(Avx512Float a, Avx512Float b) { return _mm512_max_ps(a.v, b.v); } }; // conditional specializations (Avx512Mask). template <> inline Avx512Float FloatTraits::conditional(Avx512Mask mask, Avx512Float x, Avx512Float y) { return _mm512_mask_blend_ps(mask.m, y.v, x.v); } template <> inline Avx512Int32 FloatTraits::conditional(Avx512Mask mask, Avx512Int32 x, Avx512Int32 y) { return _mm512_mask_blend_epi32(mask.m, y.v, x.v); } template <> inline Avx512Uint32 FloatTraits::conditional(Avx512Mask mask, Avx512Uint32 x, Avx512Uint32 y) { return _mm512_mask_blend_epi32(mask.m, y.v, x.v); } // conditional specializations (Avx512Int32 lane-wide mask). // Convert lane-wide mask to __mmask16 by checking sign bit (equivalent to movepi32_mask, // which requires AVX-512 DQ). template <> inline Avx512Float FloatTraits::conditional(Avx512Int32 mask, Avx512Float x, Avx512Float y) { __mmask16 m = _mm512_cmplt_epi32_mask(mask.v, _mm512_setzero_si512()); return _mm512_mask_blend_ps(m, y.v, x.v); } template <> inline Avx512Int32 FloatTraits::conditional(Avx512Int32 mask, Avx512Int32 x, Avx512Int32 y) { __mmask16 m = _mm512_cmplt_epi32_mask(mask.v, _mm512_setzero_si512()); return _mm512_mask_blend_epi32(m, y.v, x.v); } template <> inline Avx512Uint32 FloatTraits::conditional(Avx512Int32 mask, Avx512Uint32 x, Avx512Uint32 y) { __mmask16 m = _mm512_cmplt_epi32_mask(mask.v, _mm512_setzero_si512()); return _mm512_mask_blend_epi32(m, y.v, x.v); } // apply specializations (Avx512Mask): zero out lanes where mask is false. 
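// (Zero-masking: _mm512_maskz_mov_* copies the source lane where the mask bit
// is set and writes zero elsewhere, matching the scalar convention
// apply(b, x) == b * x. E.g. FloatTraits<Avx512Float>::apply(x > Avx512Float(0.0f), x)
// keeps the positive lanes of x and zeroes the rest.)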
template <> inline Avx512Float FloatTraits::apply(Avx512Mask mask, Avx512Float x) { return _mm512_maskz_mov_ps(mask.m, x.v); } template <> inline Avx512Int32 FloatTraits::apply(Avx512Mask mask, Avx512Int32 x) { return _mm512_maskz_mov_epi32(mask.m, x.v); } template <> inline Avx512Uint32 FloatTraits::apply(Avx512Mask mask, Avx512Uint32 x) { return _mm512_maskz_mov_epi32(mask.m, x.v); } template <> struct FloatTraits { using IntType = Avx512Int32; }; template <> struct FloatTraits { using IntType = Avx512Uint32; }; // --- Util function overloads for AVX-512 types --- DISPENSO_INLINE Avx512Float floor_small(Avx512Float x) { return _mm512_floor_ps(x.v); } DISPENSO_INLINE Avx512Int32 convert_to_int_trunc(Avx512Float f) { return _mm512_cvttps_epi32(f.v); } DISPENSO_INLINE Avx512Int32 convert_to_int_trunc_safe(Avx512Float f) { Avx512Int32 fi = bit_cast(f); __mmask16 norm = static_cast<__mmask16>(~_mm512_cmpeq_epi32_mask( _mm512_and_si512(fi.v, _mm512_set1_epi32(0x7f800000)), _mm512_set1_epi32(0x7f800000))); return _mm512_maskz_cvttps_epi32(norm, f.v); } DISPENSO_INLINE Avx512Int32 convert_to_int(Avx512Float f) { // _mm512_cvtps_epi32 uses round-to-nearest-even. // Use maskz to zero non-normal lanes and avoid undefined behavior. Avx512Int32 fi = bit_cast(f); __mmask16 norm = static_cast<__mmask16>(~_mm512_cmpeq_epi32_mask( _mm512_and_si512(fi.v, _mm512_set1_epi32(0x7f800000)), _mm512_set1_epi32(0x7f800000))); return _mm512_maskz_cvtps_epi32(norm, f.v); } template <> DISPENSO_INLINE Avx512Float min(Avx512Float x, Avx512Float mn) { // Ordering: if x is NaN, result is mn (second operand). return _mm512_min_ps(x.v, mn.v); } template <> DISPENSO_INLINE Avx512Float clamp_allow_nan(Avx512Float x, Avx512Float mn, Avx512Float mx) { return _mm512_max_ps(mn.v, _mm512_min_ps(mx.v, x.v)); } template <> DISPENSO_INLINE Avx512Float clamp_no_nan(Avx512Float x, Avx512Float mn, Avx512Float mx) { return _mm512_max_ps(mn.v, _mm512_min_ps(x.v, mx.v)); } template <> DISPENSO_INLINE Avx512Float gather(const float* table, Avx512Int32 index) { // Note: AVX-512 gather has index as first arg, base as second. return _mm512_i32gather_ps(index.v, table, 4); } DISPENSO_INLINE Avx512Int32 int_div_by_3(Avx512Int32 i) { // Multiply each lane by 0x55555556 and take the high 32 bits. // Process even and odd lanes separately since _mm512_mul_epu32 only uses lanes 0,2,... __m512i multiplier = _mm512_set1_epi32(0x55555556); // Even lanes (0, 2, 4, ..., 14). __m512i even = _mm512_srli_epi64(_mm512_mul_epu32(i.v, multiplier), 32); // Odd lanes (1, 3, 5, ..., 15): shift right by 32 to put odd in even positions. __m512i i_odd = _mm512_srli_epi64(i.v, 32); __m512i odd = _mm512_srli_epi64(_mm512_mul_epu32(i_odd, multiplier), 32); odd = _mm512_slli_epi64(odd, 32); // Blend at 32-bit granularity: even indices from even, odd from odd. return _mm512_mask_blend_epi32(0xAAAA, even, odd); // 0xAAAA = odd lanes from 'odd' } // nonnormal/nonnormalOrZero: return Avx512Mask for AVX-512 types. 
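// (Here "nonnormal" flags lanes whose exponent field is all ones, i.e. inf or
// NaN; nonnormalOrZero additionally flags lanes whose exponent field is zero,
// i.e. ±0 and subnormals.)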
DISPENSO_INLINE Avx512Mask nonnormal(Avx512Int32 i) { return _mm512_cmpeq_epi32_mask( _mm512_and_si512(i.v, _mm512_set1_epi32(0x7f800000)), _mm512_set1_epi32(0x7f800000)); } DISPENSO_INLINE Avx512Mask nonnormalOrZero(Avx512Int32 i) { __m512i masked = _mm512_and_si512(i.v, _mm512_set1_epi32(0x7f800000)); __mmask16 isInfNan = _mm512_cmpeq_epi32_mask(masked, _mm512_set1_epi32(0x7f800000)); __mmask16 isZero = _mm512_cmpeq_epi32_mask(masked, _mm512_setzero_si512()); return static_cast<__mmask16>(isInfNan | isZero); } DISPENSO_INLINE Avx512Mask nonnormal(Avx512Float f) { return nonnormal(bit_cast(f)); } // any_true: reduce mask to scalar bool (Avx512Mask wraps __mmask16). DISPENSO_INLINE bool any_true(Avx512Mask mask) { return mask.m != 0; } DISPENSO_INLINE Avx512Float signof(Avx512Float x) { Avx512Uint32 xi = bit_cast(x); return bit_cast((xi & 0x80000000u) | FloatTraits::kOne); } DISPENSO_INLINE Avx512Int32 signofi(Avx512Int32 i) { // Use mask blend: +1 for i >= 0, -1 for i < 0. __mmask16 neg = _mm512_cmplt_epi32_mask(i.v, _mm512_setzero_si512()); return _mm512_mask_blend_epi32(neg, _mm512_set1_epi32(1), _mm512_set1_epi32(-1)); } // nbool_as_one: 0 if mask is true, 1 if false. template <> DISPENSO_INLINE Avx512Float nbool_as_one(Avx512Mask b) { // Where mask is false (0 in mask bit), load 1.0f; where true, load 0. return _mm512_maskz_mov_ps(static_cast<__mmask16>(~b.m), _mm512_set1_ps(1.0f)); } template <> DISPENSO_INLINE Avx512Int32 nbool_as_one(Avx512Mask b) { return _mm512_maskz_set1_epi32(static_cast<__mmask16>(~b.m), 1); } template <> DISPENSO_INLINE Avx512Int32 nbool_as_one(Avx512Int32 b) { // Lane-wide mask: non-zero → "true" → return 0; zero → "false" → return 1. __mmask16 isZero = _mm512_cmpeq_epi32_mask(b.v, _mm512_setzero_si512()); return _mm512_maskz_set1_epi32(isZero, 1); } // bool_as_one: 1 if mask is true, 0 if false. template <> DISPENSO_INLINE Avx512Float bool_as_one(Avx512Mask b) { return _mm512_maskz_mov_ps(b.m, _mm512_set1_ps(1.0f)); } template <> DISPENSO_INLINE Avx512Int32 bool_as_one(Avx512Mask b) { return _mm512_maskz_set1_epi32(b.m, 1); } // bool_as_mask: convert Avx512Mask to lane-wide Avx512Int32 mask. template <> DISPENSO_INLINE Avx512Int32 bool_as_mask(Avx512Mask b) { return _mm512_maskz_set1_epi32(b.m, -1); } template <> DISPENSO_INLINE Avx512Int32 bool_as_mask(Avx512Int32 b) { return b; } template <> DISPENSO_INLINE Avx512Int32 bool_as_mask(Avx512Uint32 b) { return b.v; } template <> DISPENSO_INLINE Avx512Uint32 bool_as_mask(Avx512Mask b) { return _mm512_maskz_set1_epi32(b.m, -1); } template <> DISPENSO_INLINE Avx512Uint32 bool_as_mask(Avx512Uint32 b) { return b; } template <> DISPENSO_INLINE Avx512Uint32 bool_as_mask(Avx512Int32 b) { return b.v; } // bool_apply_or_zero: specialized for Avx512Mask to use native masked operations. template <> DISPENSO_INLINE Avx512Float bool_apply_or_zero(Avx512Mask b, Avx512Float val) { return _mm512_maskz_mov_ps(b.m, val.v); } template <> DISPENSO_INLINE Avx512Int32 bool_apply_or_zero(Avx512Mask b, Avx512Int32 val) { return _mm512_maskz_mov_epi32(b.m, val.v); } template <> DISPENSO_INLINE Avx512Uint32 bool_apply_or_zero(Avx512Mask b, Avx512Uint32 val) { return _mm512_maskz_mov_epi32(b.m, val.v); } } // namespace fast_math } // namespace dispenso #endif // defined(__AVX512F__) ================================================ FILE: dispenso/fast_math/float_traits_hwy.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once // Only include when Highway is available on the include path. // Consumers must depend on the Highway library target. #if __has_include("hwy/highway.h") #include "hwy/highway.h" #include #include "float_traits.h" #include "util.h" namespace dispenso { namespace fast_math { namespace hn = hwy::HWY_NAMESPACE; using HwyFloatTag = hn::ScalableTag; using HwyInt32Tag = hn::RebindToSigned; using HwyUint32Tag = hn::RebindToUnsigned; struct HwyInt32; struct HwyUint32; // N-wide float SIMD wrapper around Highway Vec>. // Width is determined at compile time by Highway's static dispatch. struct HwyFloat { hn::Vec v; HwyFloat() = default; HwyFloat(hn::Vec vec) : v(vec) {} // Implicit broadcast from scalar — required for polynomial constant propagation. HwyFloat(float f) : v(hn::Set(HwyFloatTag{}, f)) {} // Explicit int→float conversion for (Flt)intExpr patterns. explicit HwyFloat(HwyInt32 i); HwyFloat operator-() const { return hn::Neg(v); } HwyFloat& operator+=(HwyFloat o) { v = hn::Add(v, o.v); return *this; } HwyFloat& operator-=(HwyFloat o) { v = hn::Sub(v, o.v); return *this; } HwyFloat& operator*=(HwyFloat o) { v = hn::Mul(v, o.v); return *this; } HwyFloat& operator&=(HwyFloat o) { const HwyInt32Tag di; v = hn::BitCast(HwyFloatTag{}, hn::And(hn::BitCast(di, v), hn::BitCast(di, o.v))); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend HwyFloat operator+(HwyFloat a, HwyFloat b) { return hn::Add(a.v, b.v); } friend HwyFloat operator-(HwyFloat a, HwyFloat b) { return hn::Sub(a.v, b.v); } friend HwyFloat operator*(HwyFloat a, HwyFloat b) { return hn::Mul(a.v, b.v); } friend HwyFloat operator/(HwyFloat a, HwyFloat b) { return hn::Div(a.v, b.v); } // Comparisons return HwyFloat masks (all-ones or all-zeros per lane). friend HwyFloat operator<(HwyFloat a, HwyFloat b) { const HwyFloatTag d; return hn::VecFromMask(d, hn::Lt(a.v, b.v)); } friend HwyFloat operator>(HwyFloat a, HwyFloat b) { const HwyFloatTag d; return hn::VecFromMask(d, hn::Gt(a.v, b.v)); } friend HwyFloat operator<=(HwyFloat a, HwyFloat b) { const HwyFloatTag d; return hn::VecFromMask(d, hn::Le(a.v, b.v)); } friend HwyFloat operator>=(HwyFloat a, HwyFloat b) { const HwyFloatTag d; return hn::VecFromMask(d, hn::Ge(a.v, b.v)); } friend HwyFloat operator==(HwyFloat a, HwyFloat b) { const HwyFloatTag d; return hn::VecFromMask(d, hn::Eq(a.v, b.v)); } friend HwyFloat operator!=(HwyFloat a, HwyFloat b) { const HwyFloatTag d; return hn::VecFromMask(d, hn::Ne(a.v, b.v)); } // Logical NOT of a comparison mask. friend HwyFloat operator!(HwyFloat a) { const HwyFloatTag d; const HwyInt32Tag di; return hn::BitCast(d, hn::Xor(hn::BitCast(di, a.v), hn::Set(di, -1))); } // Bitwise ops on float masks. friend HwyFloat operator&(HwyFloat a, HwyFloat b) { const HwyFloatTag d; const HwyInt32Tag di; return hn::BitCast(d, hn::And(hn::BitCast(di, a.v), hn::BitCast(di, b.v))); } friend HwyFloat operator|(HwyFloat a, HwyFloat b) { const HwyFloatTag d; const HwyInt32Tag di; return hn::BitCast(d, hn::Or(hn::BitCast(di, a.v), hn::BitCast(di, b.v))); } friend HwyFloat operator^(HwyFloat a, HwyFloat b) { const HwyFloatTag d; const HwyInt32Tag di; return hn::BitCast(d, hn::Xor(hn::BitCast(di, a.v), hn::BitCast(di, b.v))); } }; // N-wide int32 SIMD wrapper around Highway Vec>>. 
struct HwyInt32 { hn::Vec v; HwyInt32() = default; HwyInt32(hn::Vec vec) : v(vec) {} HwyInt32(int32_t i) : v(hn::Set(HwyInt32Tag{}, i)) {} // Reinterpret from HwyUint32 (no-op). HwyInt32(HwyUint32 u); HwyInt32 operator-() const { return hn::Neg(v); } HwyInt32& operator+=(HwyInt32 o) { v = hn::Add(v, o.v); return *this; } HwyInt32& operator-=(HwyInt32 o) { v = hn::Sub(v, o.v); return *this; } HwyInt32& operator*=(HwyInt32 o) { v = hn::Mul(v, o.v); return *this; } HwyInt32& operator&=(HwyInt32 o) { v = hn::And(v, o.v); return *this; } HwyInt32& operator|=(HwyInt32 o) { v = hn::Or(v, o.v); return *this; } // Shifts. HwyInt32 operator<<(int n) const { const HwyInt32Tag di; return hn::Shl(v, hn::BitCast(di, hn::Set(HwyUint32Tag{}, static_cast(n)))); } HwyInt32 operator>>(int n) const { const HwyInt32Tag di; return hn::Shr(v, hn::BitCast(di, hn::Set(HwyUint32Tag{}, static_cast(n)))); } HwyInt32& operator<<=(int n) { const HwyInt32Tag di; v = hn::Shl(v, hn::BitCast(di, hn::Set(HwyUint32Tag{}, static_cast(n)))); return *this; } HwyInt32& operator>>=(int n) { const HwyInt32Tag di; v = hn::Shr(v, hn::BitCast(di, hn::Set(HwyUint32Tag{}, static_cast(n)))); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend HwyInt32 operator+(HwyInt32 a, HwyInt32 b) { return hn::Add(a.v, b.v); } friend HwyInt32 operator-(HwyInt32 a, HwyInt32 b) { return hn::Sub(a.v, b.v); } friend HwyInt32 operator*(HwyInt32 a, HwyInt32 b) { return hn::Mul(a.v, b.v); } // Bitwise. friend HwyInt32 operator&(HwyInt32 a, HwyInt32 b) { return hn::And(a.v, b.v); } friend HwyInt32 operator|(HwyInt32 a, HwyInt32 b) { return hn::Or(a.v, b.v); } friend HwyInt32 operator^(HwyInt32 a, HwyInt32 b) { return hn::Xor(a.v, b.v); } friend HwyInt32 operator~(HwyInt32 a) { return hn::Not(a.v); } // Comparisons return HwyInt32 masks (via VecFromMask). friend HwyInt32 operator==(HwyInt32 a, HwyInt32 b) { const HwyInt32Tag di; return hn::VecFromMask(di, hn::Eq(a.v, b.v)); } friend HwyInt32 operator!=(HwyInt32 a, HwyInt32 b) { const HwyInt32Tag di; return hn::VecFromMask(di, hn::Not(hn::Eq(a.v, b.v))); } friend HwyInt32 operator<(HwyInt32 a, HwyInt32 b) { const HwyInt32Tag di; return hn::VecFromMask(di, hn::Lt(a.v, b.v)); } friend HwyInt32 operator>(HwyInt32 a, HwyInt32 b) { const HwyInt32Tag di; return hn::VecFromMask(di, hn::Gt(a.v, b.v)); } friend HwyInt32 operator!(HwyInt32 a) { const HwyInt32Tag di; return hn::VecFromMask(di, hn::Eq(a.v, hn::Zero(di))); } }; // N-wide uint32 SIMD wrapper around Highway Vec>>. struct HwyUint32 { hn::Vec v; HwyUint32() = default; HwyUint32(hn::Vec vec) : v(vec) {} HwyUint32(uint32_t u) : v(hn::Set(HwyUint32Tag{}, u)) {} // Reinterpret from HwyInt32 (no-op). HwyUint32(HwyInt32 i) : v(hn::BitCast(HwyUint32Tag{}, i.v)) {} HwyUint32& operator+=(HwyUint32 o) { v = hn::Add(v, o.v); return *this; } HwyUint32& operator-=(HwyUint32 o) { v = hn::Sub(v, o.v); return *this; } HwyUint32& operator*=(HwyUint32 o) { v = hn::Mul(v, o.v); return *this; } HwyUint32& operator&=(HwyUint32 o) { v = hn::And(v, o.v); return *this; } HwyUint32& operator|=(HwyUint32 o) { v = hn::Or(v, o.v); return *this; } // Shifts. 
HwyUint32 operator<<(int n) const { return hn::Shl(v, hn::Set(HwyUint32Tag{}, static_cast(n))); } HwyUint32 operator>>(int n) const { return hn::Shr(v, hn::Set(HwyUint32Tag{}, static_cast(n))); } HwyUint32& operator<<=(int n) { v = hn::Shl(v, hn::Set(HwyUint32Tag{}, static_cast(n))); return *this; } HwyUint32& operator>>=(int n) { v = hn::Shr(v, hn::Set(HwyUint32Tag{}, static_cast(n))); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend HwyUint32 operator+(HwyUint32 a, HwyUint32 b) { return hn::Add(a.v, b.v); } friend HwyUint32 operator-(HwyUint32 a, HwyUint32 b) { return hn::Sub(a.v, b.v); } friend HwyUint32 operator*(HwyUint32 a, HwyUint32 b) { return hn::Mul(a.v, b.v); } // Bitwise. friend HwyUint32 operator&(HwyUint32 a, HwyUint32 b) { return hn::And(a.v, b.v); } friend HwyUint32 operator|(HwyUint32 a, HwyUint32 b) { return hn::Or(a.v, b.v); } friend HwyUint32 operator^(HwyUint32 a, HwyUint32 b) { return hn::Xor(a.v, b.v); } friend HwyUint32 operator~(HwyUint32 a) { return hn::Not(a.v); } // Comparisons. friend HwyUint32 operator==(HwyUint32 a, HwyUint32 b) { const HwyUint32Tag du; return hn::VecFromMask(du, hn::Eq(a.v, b.v)); } friend HwyUint32 operator!=(HwyUint32 a, HwyUint32 b) { const HwyUint32Tag du; return hn::VecFromMask(du, hn::Not(hn::Eq(a.v, b.v))); } friend HwyUint32 operator>(HwyUint32 a, HwyUint32 b) { const HwyUint32Tag du; return hn::VecFromMask(du, hn::Gt(a.v, b.v)); } friend HwyUint32 operator<(HwyUint32 a, HwyUint32 b) { const HwyUint32Tag du; return hn::VecFromMask(du, hn::Lt(a.v, b.v)); } friend HwyUint32 operator!(HwyUint32 a) { const HwyUint32Tag du; return hn::VecFromMask(du, hn::Eq(a.v, hn::Zero(du))); } }; // --- Deferred inline definitions --- // HwyFloat ↔ HwyInt32 conversion. inline HwyFloat::HwyFloat(HwyInt32 i) : v(hn::ConvertTo(HwyFloatTag{}, i.v)) {} // HwyInt32 ↔ HwyUint32 reinterpret (no-op). inline HwyInt32::HwyInt32(HwyUint32 u) : v(hn::BitCast(HwyInt32Tag{}, u.v)) {} // Map raw Highway vector to HwyFloat for SimdTypeFor. template <> struct SimdTypeFor> { using type = HwyFloat; }; // --- bit_cast specializations --- template <> inline HwyInt32 bit_cast(const HwyFloat& f) noexcept { return hn::BitCast(HwyInt32Tag{}, f.v); } template <> inline HwyUint32 bit_cast(const HwyFloat& f) noexcept { return hn::BitCast(HwyUint32Tag{}, f.v); } template <> inline HwyFloat bit_cast(const HwyInt32& i) noexcept { return hn::BitCast(HwyFloatTag{}, i.v); } template <> inline HwyFloat bit_cast(const HwyUint32& u) noexcept { return hn::BitCast(HwyFloatTag{}, u.v); } template <> inline HwyInt32 bit_cast(const HwyUint32& u) noexcept { return hn::BitCast(HwyInt32Tag{}, u.v); } template <> inline HwyUint32 bit_cast(const HwyInt32& i) noexcept { return hn::BitCast(HwyUint32Tag{}, i.v); } // --- FloatTraits --- template <> struct FloatTraits { using IntType = HwyInt32; using UintType = HwyUint32; using BoolType = HwyFloat; // Float comparison masks (lane-wide) static constexpr uint32_t kOne = 0x3f800000; static constexpr float kMagic = 12582912.f; static constexpr bool kBoolIsMask = true; static DISPENSO_INLINE HwyFloat sqrt(HwyFloat x) { return hn::Sqrt(x.v); } static DISPENSO_INLINE HwyFloat rcp(HwyFloat x) { return hn::ApproximateReciprocal(x.v); } // Highway always has FMA. static DISPENSO_INLINE HwyFloat fma(HwyFloat a, HwyFloat b, HwyFloat c) { return hn::MulAdd(a.v, b.v, c.v); } // conditional: select x where mask is true, y where false. 
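// Illustrative sketch (not part of the original header): because comparisons
// return lane-wide masks, conditional() is a branchless per-lane select, e.g.
//   HwyFloat absX = conditional(x < HwyFloat(0.f), -x, x);
// keeps x in lanes where it is non-negative and substitutes -x elsewhere.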
template static DISPENSO_INLINE Arg conditional(HwyFloat mask, Arg x, Arg y); template static DISPENSO_INLINE Arg conditional(HwyInt32 mask, Arg x, Arg y); template static DISPENSO_INLINE Arg apply(HwyFloat mask, Arg x); static DISPENSO_INLINE HwyFloat min(HwyFloat a, HwyFloat b) { return hn::Min(a.v, b.v); } static DISPENSO_INLINE HwyFloat max(HwyFloat a, HwyFloat b) { return hn::Max(a.v, b.v); } }; // conditional specializations (HwyFloat mask). // MaskFromVec extracts MSB: all-ones → true, all-zeros → false. template <> inline HwyFloat FloatTraits::conditional(HwyFloat mask, HwyFloat x, HwyFloat y) { auto m = hn::MaskFromVec(mask.v); return hn::IfThenElse(m, x.v, y.v); } template <> inline HwyInt32 FloatTraits::conditional(HwyFloat mask, HwyInt32 x, HwyInt32 y) { const HwyInt32Tag di; auto imask = hn::BitCast(di, mask.v); auto m = hn::MaskFromVec(imask); return hn::IfThenElse(m, x.v, y.v); } template <> inline HwyUint32 FloatTraits::conditional(HwyFloat mask, HwyUint32 x, HwyUint32 y) { const HwyUint32Tag du; auto umask = hn::BitCast(du, mask.v); auto m = hn::MaskFromVec(umask); return hn::IfThenElse(m, x.v, y.v); } // conditional specializations (HwyInt32 lane-wide mask). template <> inline HwyFloat FloatTraits::conditional(HwyInt32 mask, HwyFloat x, HwyFloat y) { const HwyFloatTag d; auto fmask = hn::BitCast(d, mask.v); auto m = hn::MaskFromVec(fmask); return hn::IfThenElse(m, x.v, y.v); } template <> inline HwyInt32 FloatTraits::conditional(HwyInt32 mask, HwyInt32 x, HwyInt32 y) { auto m = hn::MaskFromVec(mask.v); return hn::IfThenElse(m, x.v, y.v); } template <> inline HwyUint32 FloatTraits::conditional(HwyInt32 mask, HwyUint32 x, HwyUint32 y) { const HwyUint32Tag du; auto umask = hn::BitCast(du, mask.v); auto m = hn::MaskFromVec(umask); return hn::IfThenElse(m, x.v, y.v); } // apply: mask & x (bitwise AND). template <> inline HwyFloat FloatTraits::apply(HwyFloat mask, HwyFloat x) { const HwyFloatTag d; const HwyInt32Tag di; return hn::BitCast(d, hn::And(hn::BitCast(di, mask.v), hn::BitCast(di, x.v))); } template <> inline HwyInt32 FloatTraits::apply(HwyFloat mask, HwyInt32 x) { const HwyInt32Tag di; return hn::And(hn::BitCast(di, mask.v), x.v); } template <> inline HwyUint32 FloatTraits::apply(HwyFloat mask, HwyUint32 x) { const HwyUint32Tag du; return hn::And(hn::BitCast(du, mask.v), x.v); } template <> struct FloatTraits { using IntType = HwyInt32; }; template <> struct FloatTraits { using IntType = HwyUint32; }; // --- Util function overloads for Highway types --- DISPENSO_INLINE HwyFloat floor_small(HwyFloat x) { return hn::Floor(x.v); } DISPENSO_INLINE HwyInt32 convert_to_int_trunc(HwyFloat f) { const HwyInt32Tag di; // ConvertTo truncates toward zero (no Round needed). return hn::ConvertTo(di, f.v); } DISPENSO_INLINE HwyInt32 convert_to_int_trunc_safe(HwyFloat f) { const HwyInt32Tag di; auto converted = hn::ConvertTo(di, f.v); HwyInt32 fi = bit_cast(f); HwyInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & HwyInt32(converted); } DISPENSO_INLINE HwyInt32 convert_to_int(HwyFloat f) { const HwyInt32Tag di; // Round to nearest even, then convert to int. auto rounded = hn::Round(f.v); auto converted = hn::ConvertTo(di, rounded); // Mask non-normals to 0 to avoid undefined behavior. HwyInt32 fi = bit_cast(f); HwyInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & HwyInt32(converted); } template <> DISPENSO_INLINE HwyFloat min(HwyFloat x, HwyFloat mn) { // If x is NaN, return mn. Explicit NaN handling for portability. 
auto is_num = hn::Eq(x.v, x.v); // true for non-NaN auto result = hn::Min(x.v, mn.v); return hn::IfThenElse(is_num, result, mn.v); } template <> DISPENSO_INLINE HwyFloat clamp_allow_nan(HwyFloat x, HwyFloat mn, HwyFloat mx) { // NaN propagates: if x is NaN, result is NaN. auto is_nan = hn::Not(hn::Eq(x.v, x.v)); auto clamped = hn::Max(mn.v, hn::Min(mx.v, x.v)); return hn::IfThenElse(is_nan, x.v, clamped); } template <> DISPENSO_INLINE HwyFloat clamp_no_nan(HwyFloat x, HwyFloat mn, HwyFloat mx) { // Highway's Min/Max follow IEEE 754-2019 minimum/maximum semantics: if // either operand is NaN, the result is NaN. But we want NaN-suppressing // behavior (like x86 minps/maxps): if x is NaN, result should be mn. // Clamp(x) = Max(mn, Min(x, mx)). With NaN x: // Min(NaN, mx) = NaN → Max(mn, NaN) = NaN. Wrong — we want mn. // Fix: use ordered comparison to detect NaN lanes and blend. auto clamped = hn::Max(mn.v, hn::Min(x.v, mx.v)); auto x_valid = hn::Eq(x.v, x.v); // false for NaN lanes return hn::IfThenElse(x_valid, clamped, mn.v); } template <> DISPENSO_INLINE HwyFloat gather(const float* table, HwyInt32 index) { const HwyFloatTag d; return hn::GatherIndex(d, table, index.v); } DISPENSO_INLINE HwyInt32 int_div_by_3(HwyInt32 i) { // Multiply each 32-bit lane by 0x55555556 and take the high 32 bits. // Uses widening MulEven/MulOdd to stay in SIMD registers (no scalar fallback). const HwyUint32Tag du; const HwyInt32Tag di; auto ui = hn::BitCast(du, i.v); auto mul = hn::Set(du, 0x55555556u); // Even lanes (0, 2, ...): widening 32×32→64 multiply. auto even_prod = hn::MulEven(ui, mul); // Odd lanes (1, 3, ...): widening 32×32→64 multiply. auto odd_prod = hn::MulOdd(ui, mul); // Extract high 32 bits of each 64-bit product. auto even_hi = hn::ShiftRight<32>(even_prod); auto odd_hi = hn::ShiftRight<32>(odd_prod); // Interleave: odd results into upper 32 bits of each 64-bit lane, OR with even. auto combined = hn::Or(even_hi, hn::ShiftLeft<32>(odd_hi)); return hn::BitCast(di, combined); } // nonnormal/nonnormalOrZero: return HwyInt32 masks. DISPENSO_INLINE HwyInt32 nonnormal(HwyInt32 i) { return (i & 0x7f800000) == 0x7f800000; } DISPENSO_INLINE HwyInt32 nonnormalOrZero(HwyInt32 i) { auto m = i & 0x7f800000; return (m == 0x7f800000) | (m == 0); } DISPENSO_INLINE HwyInt32 nonnormal(HwyFloat f) { return nonnormal(bit_cast(f)); } // any_true: reduce SIMD mask to scalar bool (true if any lane is set). DISPENSO_INLINE bool any_true(HwyInt32 mask) { hn::ScalableTag d; return !hn::AllFalse(d, hn::Ne(mask.v, hn::Zero(d))); } DISPENSO_INLINE HwyFloat signof(HwyFloat x) { HwyUint32 xi = bit_cast(x); return bit_cast((xi & 0x80000000u) | FloatTraits::kOne); } DISPENSO_INLINE HwyInt32 signofi(HwyInt32 i) { return HwyInt32(1) - (HwyInt32(2) & (i < HwyInt32(0))); } // nbool_as_one: 0 if mask is true (all-ones), 1 if false (all-zeros). template <> DISPENSO_INLINE HwyFloat nbool_as_one(HwyFloat b) { const HwyFloatTag d; const HwyInt32Tag di; auto bi = hn::BitCast(di, b.v); auto one_bits = hn::Set(di, 0x3f800000); return hn::BitCast(d, hn::AndNot(bi, one_bits)); } template <> DISPENSO_INLINE HwyInt32 nbool_as_one(HwyFloat b) { const HwyInt32Tag di; auto bi = hn::BitCast(di, b.v); return hn::AndNot(bi, hn::Set(di, 1)); } template <> DISPENSO_INLINE HwyInt32 nbool_as_one(HwyInt32 b) { const HwyInt32Tag di; return hn::AndNot(b.v, hn::Set(di, 1)); } // bool_as_one: 1 if mask is true, 0 if false. 
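// Illustrative note (lane values assumed): a lane mask is all-ones or all-zeros,
// so ANDing it with the bit pattern of 1.0f (0x3f800000) yields 1.0f or 0.0f with
// no int->float conversion. E.g. lanes {0xFFFFFFFF, 0x00000000} give
// bool_as_one<HwyFloat>: {1.0f, 0.0f} and nbool_as_one<HwyFloat>: {0.0f, 1.0f}.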
template <> DISPENSO_INLINE HwyFloat bool_as_one(HwyFloat b) { const HwyFloatTag d; const HwyInt32Tag di; auto bi = hn::BitCast(di, b.v); return hn::BitCast(d, hn::And(bi, hn::Set(di, 0x3f800000))); } template <> DISPENSO_INLINE HwyInt32 bool_as_one(HwyFloat b) { const HwyInt32Tag di; auto bi = hn::BitCast(di, b.v); return hn::And(bi, hn::Set(di, 1)); } // bool_as_mask: for SIMD masks, identity (already a mask). template <> DISPENSO_INLINE HwyInt32 bool_as_mask(HwyFloat b) { return hn::BitCast(HwyInt32Tag{}, b.v); } template <> DISPENSO_INLINE HwyInt32 bool_as_mask(HwyInt32 b) { return b; } template <> DISPENSO_INLINE HwyInt32 bool_as_mask(HwyUint32 b) { return hn::BitCast(HwyInt32Tag{}, b.v); } template <> DISPENSO_INLINE HwyUint32 bool_as_mask(HwyFloat b) { return hn::BitCast(HwyUint32Tag{}, b.v); } template <> DISPENSO_INLINE HwyUint32 bool_as_mask(HwyUint32 b) { return b; } template <> DISPENSO_INLINE HwyUint32 bool_as_mask(HwyInt32 b) { return hn::BitCast(HwyUint32Tag{}, b.v); } } // namespace fast_math } // namespace dispenso #endif // __has_include("hwy/highway.h") ================================================ FILE: dispenso/fast_math/float_traits_neon.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #if defined(__aarch64__) #include #include #include "float_traits.h" #include "util.h" namespace dispenso { namespace fast_math { struct NeonInt32; struct NeonUint32; // 4-wide float SIMD wrapper around float32x4_t (AArch64 NEON). struct NeonFloat { float32x4_t v; NeonFloat() = default; NeonFloat(float32x4_t vec) : v(vec) {} // Implicit broadcast from scalar — required for polynomial constant propagation. NeonFloat(float f) : v(vdupq_n_f32(f)) {} // Explicit int→float conversion for (Flt)intExpr patterns. explicit NeonFloat(NeonInt32 i); NeonFloat operator-() const { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v), vdupq_n_u32(0x80000000))); } NeonFloat& operator+=(NeonFloat o) { v = vaddq_f32(v, o.v); return *this; } NeonFloat& operator-=(NeonFloat o) { v = vsubq_f32(v, o.v); return *this; } NeonFloat& operator*=(NeonFloat o) { v = vmulq_f32(v, o.v); return *this; } NeonFloat& operator&=(NeonFloat o) { v = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v), vreinterpretq_u32_f32(o.v))); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend NeonFloat operator+(NeonFloat a, NeonFloat b) { return vaddq_f32(a.v, b.v); } friend NeonFloat operator-(NeonFloat a, NeonFloat b) { return vsubq_f32(a.v, b.v); } friend NeonFloat operator*(NeonFloat a, NeonFloat b) { return vmulq_f32(a.v, b.v); } friend NeonFloat operator/(NeonFloat a, NeonFloat b) { return vdivq_f32(a.v, b.v); } // Comparisons return NeonFloat masks (all-ones or all-zeros per lane). // NEON comparisons produce uint32x4_t; reinterpret to float32x4_t. 
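// Illustrative example (lane values assumed): comparing a lane holding 1.0f against
// 2.0f with operator< yields 0xFFFFFFFF in that lane, while 4.0f < 3.0f yields
// 0x00000000; the resulting NeonFloat can be fed directly to conditional(), apply(),
// or combined with & / | like any other mask.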
friend NeonFloat operator<(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vcltq_f32(a.v, b.v)); } friend NeonFloat operator>(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vcgtq_f32(a.v, b.v)); } friend NeonFloat operator<=(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vcleq_f32(a.v, b.v)); } friend NeonFloat operator>=(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vcgeq_f32(a.v, b.v)); } friend NeonFloat operator==(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vceqq_f32(a.v, b.v)); } friend NeonFloat operator!=(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.v, b.v))); } // Logical NOT of a comparison mask. friend NeonFloat operator!(NeonFloat a) { return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a.v))); } // Bitwise ops on float masks. friend NeonFloat operator&(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))); } friend NeonFloat operator|(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))); } friend NeonFloat operator^(NeonFloat a, NeonFloat b) { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))); } }; // 4-wide int32 SIMD wrapper around int32x4_t. struct NeonInt32 { int32x4_t v; NeonInt32() = default; NeonInt32(int32x4_t vec) : v(vec) {} NeonInt32(int32_t i) : v(vdupq_n_s32(i)) {} // Reinterpret from NeonUint32 (no-op). NeonInt32(NeonUint32 u); NeonInt32 operator-() const { return vnegq_s32(v); } NeonInt32& operator+=(NeonInt32 o) { v = vaddq_s32(v, o.v); return *this; } NeonInt32& operator-=(NeonInt32 o) { v = vsubq_s32(v, o.v); return *this; } NeonInt32& operator*=(NeonInt32 o) { v = vmulq_s32(v, o.v); return *this; } NeonInt32& operator&=(NeonInt32 o) { v = vandq_s32(v, o.v); return *this; } NeonInt32& operator|=(NeonInt32 o) { v = vorrq_s32(v, o.v); return *this; } // Shifts (NEON uses vshlq with signed shift amount: positive=left, negative=right). NeonInt32 operator<<(int n) const { return vshlq_s32(v, vdupq_n_s32(n)); } NeonInt32 operator>>(int n) const { return vshlq_s32(v, vdupq_n_s32(-n)); // Arithmetic right shift } NeonInt32& operator<<=(int n) { v = vshlq_s32(v, vdupq_n_s32(n)); return *this; } NeonInt32& operator>>=(int n) { v = vshlq_s32(v, vdupq_n_s32(-n)); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend NeonInt32 operator+(NeonInt32 a, NeonInt32 b) { return vaddq_s32(a.v, b.v); } friend NeonInt32 operator-(NeonInt32 a, NeonInt32 b) { return vsubq_s32(a.v, b.v); } friend NeonInt32 operator*(NeonInt32 a, NeonInt32 b) { return vmulq_s32(a.v, b.v); } // Bitwise. friend NeonInt32 operator&(NeonInt32 a, NeonInt32 b) { return vandq_s32(a.v, b.v); } friend NeonInt32 operator|(NeonInt32 a, NeonInt32 b) { return vorrq_s32(a.v, b.v); } friend NeonInt32 operator^(NeonInt32 a, NeonInt32 b) { return veorq_s32(a.v, b.v); } friend NeonInt32 operator~(NeonInt32 a) { return vmvnq_s32(a.v); } // Comparisons return NeonInt32 masks (reinterpreted from uint32x4_t). 
friend NeonInt32 operator==(NeonInt32 a, NeonInt32 b) { return vreinterpretq_s32_u32(vceqq_s32(a.v, b.v)); } friend NeonInt32 operator!=(NeonInt32 a, NeonInt32 b) { return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(a.v, b.v))); } friend NeonInt32 operator<(NeonInt32 a, NeonInt32 b) { return vreinterpretq_s32_u32(vcltq_s32(a.v, b.v)); } friend NeonInt32 operator>(NeonInt32 a, NeonInt32 b) { return vreinterpretq_s32_u32(vcgtq_s32(a.v, b.v)); } friend NeonInt32 operator!(NeonInt32 a) { return vreinterpretq_s32_u32(vceqq_s32(a.v, vdupq_n_s32(0))); } }; // 4-wide uint32 SIMD wrapper around uint32x4_t. struct NeonUint32 { uint32x4_t v; NeonUint32() = default; NeonUint32(uint32x4_t vec) : v(vec) {} NeonUint32(uint32_t u) : v(vdupq_n_u32(u)) {} // Reinterpret from NeonInt32 (no-op). NeonUint32(NeonInt32 i) : v(vreinterpretq_u32_s32(i.v)) {} NeonUint32& operator+=(NeonUint32 o) { v = vaddq_u32(v, o.v); return *this; } NeonUint32& operator-=(NeonUint32 o) { v = vsubq_u32(v, o.v); return *this; } NeonUint32& operator*=(NeonUint32 o) { v = vmulq_u32(v, o.v); return *this; } NeonUint32& operator&=(NeonUint32 o) { v = vandq_u32(v, o.v); return *this; } NeonUint32& operator|=(NeonUint32 o) { v = vorrq_u32(v, o.v); return *this; } // Shifts. NeonUint32 operator<<(int n) const { return vshlq_u32(v, vdupq_n_s32(n)); } NeonUint32 operator>>(int n) const { return vshlq_u32(v, vdupq_n_s32(-n)); // Logical right shift } NeonUint32& operator<<=(int n) { v = vshlq_u32(v, vdupq_n_s32(n)); return *this; } NeonUint32& operator>>=(int n) { v = vshlq_u32(v, vdupq_n_s32(-n)); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend NeonUint32 operator+(NeonUint32 a, NeonUint32 b) { return vaddq_u32(a.v, b.v); } friend NeonUint32 operator-(NeonUint32 a, NeonUint32 b) { return vsubq_u32(a.v, b.v); } friend NeonUint32 operator*(NeonUint32 a, NeonUint32 b) { return vmulq_u32(a.v, b.v); } // Bitwise. friend NeonUint32 operator&(NeonUint32 a, NeonUint32 b) { return vandq_u32(a.v, b.v); } friend NeonUint32 operator|(NeonUint32 a, NeonUint32 b) { return vorrq_u32(a.v, b.v); } friend NeonUint32 operator^(NeonUint32 a, NeonUint32 b) { return veorq_u32(a.v, b.v); } friend NeonUint32 operator~(NeonUint32 a) { return vmvnq_u32(a.v); } // NEON has native unsigned comparisons. friend NeonUint32 operator==(NeonUint32 a, NeonUint32 b) { return vceqq_u32(a.v, b.v); } friend NeonUint32 operator!=(NeonUint32 a, NeonUint32 b) { return vmvnq_u32(vceqq_u32(a.v, b.v)); } friend NeonUint32 operator>(NeonUint32 a, NeonUint32 b) { return vcgtq_u32(a.v, b.v); } friend NeonUint32 operator<(NeonUint32 a, NeonUint32 b) { return vcltq_u32(a.v, b.v); } friend NeonUint32 operator!(NeonUint32 a) { return vceqq_u32(a.v, vdupq_n_u32(0)); } }; // --- Deferred inline definitions --- // NeonFloat ↔ NeonInt32 conversion. inline NeonFloat::NeonFloat(NeonInt32 i) : v(vcvtq_f32_s32(i.v)) {} // NeonInt32 ↔ NeonUint32 reinterpret (no-op). inline NeonInt32::NeonInt32(NeonUint32 u) : v(vreinterpretq_s32_u32(u.v)) {} // Map raw float32x4_t to NeonFloat for SimdTypeFor. 
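// Illustrative sketch (an assumption about the generic helpers in util.h): the
// SimdTypeFor mapping below lets raw intrinsic vectors flow through the generic
// functions, e.g.
//   float32x4_t y = dispenso::fast_math::fabs(x);  // x is a raw float32x4_t
// which wraps x in NeonFloat, applies the wrapper overload, and returns .v.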
template <> struct SimdTypeFor { using type = NeonFloat; }; // --- bit_cast specializations --- template <> inline NeonInt32 bit_cast(const NeonFloat& f) noexcept { return vreinterpretq_s32_f32(f.v); } template <> inline NeonUint32 bit_cast(const NeonFloat& f) noexcept { return vreinterpretq_u32_f32(f.v); } template <> inline NeonFloat bit_cast(const NeonInt32& i) noexcept { return vreinterpretq_f32_s32(i.v); } template <> inline NeonFloat bit_cast(const NeonUint32& u) noexcept { return vreinterpretq_f32_u32(u.v); } template <> inline NeonInt32 bit_cast(const NeonUint32& u) noexcept { return vreinterpretq_s32_u32(u.v); } template <> inline NeonUint32 bit_cast(const NeonInt32& i) noexcept { return vreinterpretq_u32_s32(i.v); } // --- FloatTraits --- template <> struct FloatTraits { using IntType = NeonInt32; using UintType = NeonUint32; using BoolType = NeonFloat; // Float comparison masks (lane-wide, like SSE) static constexpr uint32_t kOne = 0x3f800000; static constexpr float kMagic = 12582912.f; static constexpr bool kBoolIsMask = true; static DISPENSO_INLINE NeonFloat sqrt(NeonFloat x) { return vsqrtq_f32(x.v); } static DISPENSO_INLINE NeonFloat rcp(NeonFloat x) { return vrecpeq_f32(x.v); } // AArch64 always has FMA. static DISPENSO_INLINE NeonFloat fma(NeonFloat a, NeonFloat b, NeonFloat c) { return vfmaq_f32(c.v, a.v, b.v); // vfmaq_f32(acc, a, b) = acc + a*b } // conditional: select x where mask is true, y where false. template static DISPENSO_INLINE Arg conditional(NeonFloat mask, Arg x, Arg y); template static DISPENSO_INLINE Arg conditional(NeonInt32 mask, Arg x, Arg y); template static DISPENSO_INLINE Arg apply(NeonFloat mask, Arg x); static DISPENSO_INLINE NeonFloat min(NeonFloat a, NeonFloat b) { return vminnmq_f32(a.v, b.v); } static DISPENSO_INLINE NeonFloat max(NeonFloat a, NeonFloat b) { return vmaxnmq_f32(a.v, b.v); } }; // conditional specializations (NeonFloat mask). // vbslq selects from first operand where mask bits are 1, second where 0. template <> inline NeonFloat FloatTraits::conditional(NeonFloat mask, NeonFloat x, NeonFloat y) { return vbslq_f32(vreinterpretq_u32_f32(mask.v), x.v, y.v); } template <> inline NeonInt32 FloatTraits::conditional(NeonFloat mask, NeonInt32 x, NeonInt32 y) { return vreinterpretq_s32_u32(vbslq_u32( vreinterpretq_u32_f32(mask.v), vreinterpretq_u32_s32(x.v), vreinterpretq_u32_s32(y.v))); } template <> inline NeonUint32 FloatTraits::conditional(NeonFloat mask, NeonUint32 x, NeonUint32 y) { return vbslq_u32(vreinterpretq_u32_f32(mask.v), x.v, y.v); } // conditional specializations (NeonInt32 lane-wide mask). template <> inline NeonFloat FloatTraits::conditional(NeonInt32 mask, NeonFloat x, NeonFloat y) { return vbslq_f32(vreinterpretq_u32_s32(mask.v), x.v, y.v); } template <> inline NeonInt32 FloatTraits::conditional(NeonInt32 mask, NeonInt32 x, NeonInt32 y) { return vreinterpretq_s32_u32(vbslq_u32( vreinterpretq_u32_s32(mask.v), vreinterpretq_u32_s32(x.v), vreinterpretq_u32_s32(y.v))); } template <> inline NeonUint32 FloatTraits::conditional(NeonInt32 mask, NeonUint32 x, NeonUint32 y) { return vbslq_u32(vreinterpretq_u32_s32(mask.v), x.v, y.v); } // apply: mask & x (bitwise AND). 
template <> inline NeonFloat FloatTraits::apply(NeonFloat mask, NeonFloat x) { return vreinterpretq_f32_u32( vandq_u32(vreinterpretq_u32_f32(mask.v), vreinterpretq_u32_f32(x.v))); } template <> inline NeonInt32 FloatTraits::apply(NeonFloat mask, NeonInt32 x) { return vreinterpretq_s32_u32( vandq_u32(vreinterpretq_u32_f32(mask.v), vreinterpretq_u32_s32(x.v))); } template <> inline NeonUint32 FloatTraits::apply(NeonFloat mask, NeonUint32 x) { return vandq_u32(vreinterpretq_u32_f32(mask.v), x.v); } template <> struct FloatTraits { using IntType = NeonInt32; }; template <> struct FloatTraits { using IntType = NeonUint32; }; // --- Util function overloads for NEON types --- DISPENSO_INLINE NeonFloat floor_small(NeonFloat x) { return vrndmq_f32(x.v); } DISPENSO_INLINE NeonInt32 convert_to_int_trunc(NeonFloat f) { return vcvtq_s32_f32(f.v); } DISPENSO_INLINE NeonInt32 convert_to_int_trunc_safe(NeonFloat f) { NeonInt32 fi = bit_cast(f); NeonInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & NeonInt32(vcvtq_s32_f32(f.v)); } DISPENSO_INLINE NeonInt32 convert_to_int(NeonFloat f) { // vcvtnq_s32_f32 uses round-to-nearest-even. // Mask non-normals to 0 to avoid undefined behavior. NeonInt32 fi = bit_cast(f); NeonInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & NeonInt32(vcvtnq_s32_f32(f.v)); } template <> DISPENSO_INLINE NeonFloat min(NeonFloat x, NeonFloat mn) { // vminnmq_f32: if x is NaN, returns mn (the number). NaN-suppressing. return vminnmq_f32(x.v, mn.v); } template <> DISPENSO_INLINE NeonFloat clamp_allow_nan(NeonFloat x, NeonFloat mn, NeonFloat mx) { // Use NaN-propagating min/max (FMIN/FMAX) so NaN passes through. return vmaxq_f32(mn.v, vminq_f32(mx.v, x.v)); } template <> DISPENSO_INLINE NeonFloat clamp_no_nan(NeonFloat x, NeonFloat mn, NeonFloat mx) { // Use NaN-suppressing min/max (FMINNM/FMAXNM) so NaN is replaced. return vmaxnmq_f32(mn.v, vminnmq_f32(x.v, mx.v)); } template <> DISPENSO_INLINE NeonFloat gather(const float* table, NeonInt32 index) { // No NEON gather instruction; extract indices and load scalar. alignas(16) int32_t idx[4]; vst1q_s32(idx, index.v); float vals[4] = {table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]}; return vld1q_f32(vals); } DISPENSO_INLINE NeonInt32 int_div_by_3(NeonInt32 i) { // Multiply each lane by 0x55555556 and take the high 32 bits. // Process low (lanes 0,1) and high (lanes 2,3) halves separately. uint32x4_t ui = vreinterpretq_u32_s32(i.v); uint32x2_t mul = vdup_n_u32(0x55555556); // Low lanes (0, 1). uint64x2_t lo_prod = vmull_u32(vget_low_u32(ui), mul); uint32x2_t lo_result = vshrn_n_u64(lo_prod, 32); // High lanes (2, 3). uint64x2_t hi_prod = vmull_u32(vget_high_u32(ui), mul); uint32x2_t hi_result = vshrn_n_u64(hi_prod, 32); return vreinterpretq_s32_u32(vcombine_u32(lo_result, hi_result)); } // nonnormal/nonnormalOrZero: return NeonInt32 masks for NEON types. DISPENSO_INLINE NeonInt32 nonnormal(NeonInt32 i) { return (i & 0x7f800000) == 0x7f800000; } DISPENSO_INLINE NeonInt32 nonnormalOrZero(NeonInt32 i) { auto m = i & 0x7f800000; return (m == 0x7f800000) | (m == 0); } DISPENSO_INLINE NeonInt32 nonnormal(NeonFloat f) { return nonnormal(bit_cast(f)); } // any_true: reduce SIMD mask to scalar bool (true if any lane is set). 
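// Illustrative usage (a sketch, not from this header): any_true() lets vector code
// fall back to a scalar path only when some lane actually needs it, e.g.
//   if (any_true(nonnormal(x))) { /* handle inf/NaN lanes separately */ }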
DISPENSO_INLINE bool any_true(NeonInt32 mask) { return vmaxvq_u32(vreinterpretq_u32_s32(mask.v)) != 0; } DISPENSO_INLINE NeonFloat signof(NeonFloat x) { NeonUint32 xi = bit_cast(x); return bit_cast((xi & 0x80000000u) | FloatTraits::kOne); } DISPENSO_INLINE NeonInt32 signofi(NeonInt32 i) { return NeonInt32(1) - (NeonInt32(2) & (i < NeonInt32(0))); } // nbool_as_one: 0 if mask is true (all-ones), 1 if false (all-zeros). template <> DISPENSO_INLINE NeonFloat nbool_as_one(NeonFloat b) { // ~mask & 1.0f bits return bit_cast(NeonInt32( vreinterpretq_s32_u32(vbicq_u32(vdupq_n_u32(0x3f800000), vreinterpretq_u32_f32(b.v))))); } template <> DISPENSO_INLINE NeonInt32 nbool_as_one(NeonFloat b) { return vreinterpretq_s32_u32(vbicq_u32(vdupq_n_u32(1), vreinterpretq_u32_f32(b.v))); } template <> DISPENSO_INLINE NeonInt32 nbool_as_one(NeonInt32 b) { return vreinterpretq_s32_u32(vbicq_u32(vdupq_n_u32(1), vreinterpretq_u32_s32(b.v))); } // bool_as_one: 1 if mask is true, 0 if false. template <> DISPENSO_INLINE NeonFloat bool_as_one(NeonFloat b) { return bit_cast(NeonInt32( vreinterpretq_s32_u32(vandq_u32(vreinterpretq_u32_f32(b.v), vdupq_n_u32(0x3f800000))))); } template <> DISPENSO_INLINE NeonInt32 bool_as_one(NeonFloat b) { return vreinterpretq_s32_u32(vandq_u32(vreinterpretq_u32_f32(b.v), vdupq_n_u32(1))); } // bool_as_mask: for SIMD masks, identity (already a mask). template <> DISPENSO_INLINE NeonInt32 bool_as_mask(NeonFloat b) { return vreinterpretq_s32_f32(b.v); } template <> DISPENSO_INLINE NeonInt32 bool_as_mask(NeonInt32 b) { return b; } template <> DISPENSO_INLINE NeonInt32 bool_as_mask(NeonUint32 b) { return vreinterpretq_s32_u32(b.v); } template <> DISPENSO_INLINE NeonUint32 bool_as_mask(NeonFloat b) { return vreinterpretq_u32_f32(b.v); } template <> DISPENSO_INLINE NeonUint32 bool_as_mask(NeonUint32 b) { return b; } template <> DISPENSO_INLINE NeonUint32 bool_as_mask(NeonInt32 b) { return vreinterpretq_u32_s32(b.v); } } // namespace fast_math } // namespace dispenso #endif // defined(__aarch64__) ================================================ FILE: dispenso/fast_math/float_traits_x86.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #if defined(__SSE4_1__) #include #include #include "float_traits.h" #include "util.h" namespace dispenso { namespace fast_math { struct SseInt32; struct SseUint32; // 4-wide float SIMD wrapper around __m128. struct SseFloat { __m128 v; SseFloat() = default; SseFloat(__m128 vec) : v(vec) {} // Implicit broadcast from scalar — required for polynomial constant propagation. SseFloat(float f) : v(_mm_set1_ps(f)) {} // Explicit int→float conversion for (Flt)intExpr patterns. explicit SseFloat(SseInt32 i); // Implicit conversion back to raw intrinsic type. operator __m128() const { return v; } SseFloat operator-() const { return _mm_xor_ps(v, _mm_set1_ps(-0.0f)); } SseFloat& operator+=(SseFloat o) { v = _mm_add_ps(v, o.v); return *this; } SseFloat& operator-=(SseFloat o) { v = _mm_sub_ps(v, o.v); return *this; } SseFloat& operator*=(SseFloat o) { v = _mm_mul_ps(v, o.v); return *this; } SseFloat& operator&=(SseFloat o) { v = _mm_and_ps(v, o.v); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). 
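// Illustrative note: defining these as non-member friends means the implicit
// float broadcast applies to either operand, so both `x * 0.5f` and `0.5f * x`
// compile for an SseFloat x; member operators would only convert the right-hand side.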
friend SseFloat operator+(SseFloat a, SseFloat b) { return _mm_add_ps(a.v, b.v); } friend SseFloat operator-(SseFloat a, SseFloat b) { return _mm_sub_ps(a.v, b.v); } friend SseFloat operator*(SseFloat a, SseFloat b) { return _mm_mul_ps(a.v, b.v); } friend SseFloat operator/(SseFloat a, SseFloat b) { return _mm_div_ps(a.v, b.v); } // Comparisons return SseFloat masks (all-ones or all-zeros per lane). friend SseFloat operator<(SseFloat a, SseFloat b) { return _mm_cmplt_ps(a.v, b.v); } friend SseFloat operator>(SseFloat a, SseFloat b) { return _mm_cmpgt_ps(a.v, b.v); } friend SseFloat operator<=(SseFloat a, SseFloat b) { return _mm_cmple_ps(a.v, b.v); } friend SseFloat operator>=(SseFloat a, SseFloat b) { return _mm_cmpge_ps(a.v, b.v); } friend SseFloat operator==(SseFloat a, SseFloat b) { return _mm_cmpeq_ps(a.v, b.v); } friend SseFloat operator!=(SseFloat a, SseFloat b) { return _mm_cmpneq_ps(a.v, b.v); } // Logical NOT of a comparison mask. friend SseFloat operator!(SseFloat a) { return _mm_xor_ps(a.v, _mm_castsi128_ps(_mm_set1_epi32(-1))); } // Bitwise ops on float masks. friend SseFloat operator&(SseFloat a, SseFloat b) { return _mm_and_ps(a.v, b.v); } friend SseFloat operator|(SseFloat a, SseFloat b) { return _mm_or_ps(a.v, b.v); } friend SseFloat operator^(SseFloat a, SseFloat b) { return _mm_xor_ps(a.v, b.v); } }; // 4-wide int32 SIMD wrapper around __m128i. struct SseInt32 { __m128i v; SseInt32() = default; SseInt32(__m128i vec) : v(vec) {} SseInt32(int32_t i) : v(_mm_set1_epi32(i)) {} // Reinterpret from SseUint32 (no-op on __m128i). SseInt32(SseUint32 u); // Implicit conversion back to raw intrinsic type. operator __m128i() const { return v; } SseInt32 operator-() const { return _mm_sub_epi32(_mm_setzero_si128(), v); } SseInt32& operator+=(SseInt32 o) { v = _mm_add_epi32(v, o.v); return *this; } SseInt32& operator-=(SseInt32 o) { v = _mm_sub_epi32(v, o.v); return *this; } SseInt32& operator*=(SseInt32 o) { v = _mm_mullo_epi32(v, o.v); return *this; } SseInt32& operator&=(SseInt32 o) { v = _mm_and_si128(v, o.v); return *this; } SseInt32& operator|=(SseInt32 o) { v = _mm_or_si128(v, o.v); return *this; } // Shifts. SseInt32 operator<<(int n) const { return _mm_slli_epi32(v, n); } SseInt32 operator>>(int n) const { return _mm_srai_epi32(v, n); // Arithmetic shift right } SseInt32& operator<<=(int n) { v = _mm_slli_epi32(v, n); return *this; } SseInt32& operator>>=(int n) { v = _mm_srai_epi32(v, n); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend SseInt32 operator+(SseInt32 a, SseInt32 b) { return _mm_add_epi32(a.v, b.v); } friend SseInt32 operator-(SseInt32 a, SseInt32 b) { return _mm_sub_epi32(a.v, b.v); } friend SseInt32 operator*(SseInt32 a, SseInt32 b) { return _mm_mullo_epi32(a.v, b.v); // SSE4.1 } // Bitwise. friend SseInt32 operator&(SseInt32 a, SseInt32 b) { return _mm_and_si128(a.v, b.v); } friend SseInt32 operator|(SseInt32 a, SseInt32 b) { return _mm_or_si128(a.v, b.v); } friend SseInt32 operator^(SseInt32 a, SseInt32 b) { return _mm_xor_si128(a.v, b.v); } friend SseInt32 operator~(SseInt32 a) { return _mm_xor_si128(a.v, _mm_set1_epi32(-1)); } // Comparisons return SseInt32 masks. 
friend SseInt32 operator==(SseInt32 a, SseInt32 b) { return _mm_cmpeq_epi32(a.v, b.v); } friend SseInt32 operator!=(SseInt32 a, SseInt32 b) { return _mm_xor_si128(_mm_cmpeq_epi32(a.v, b.v), _mm_set1_epi32(-1)); } friend SseInt32 operator<(SseInt32 a, SseInt32 b) { return _mm_cmplt_epi32(a.v, b.v); } friend SseInt32 operator>(SseInt32 a, SseInt32 b) { return _mm_cmpgt_epi32(a.v, b.v); } friend SseInt32 operator!(SseInt32 a) { return _mm_cmpeq_epi32(a.v, _mm_setzero_si128()); } }; // 4-wide uint32 SIMD wrapper around __m128i. struct SseUint32 { __m128i v; SseUint32() = default; SseUint32(__m128i vec) : v(vec) {} SseUint32(uint32_t u) : v(_mm_set1_epi32(static_cast(u))) {} // Reinterpret from SseInt32 (no-op on __m128i). SseUint32(SseInt32 i) : v(i.v) {} // Implicit conversion back to raw intrinsic type. operator __m128i() const { return v; } SseUint32& operator+=(SseUint32 o) { v = _mm_add_epi32(v, o.v); return *this; } SseUint32& operator-=(SseUint32 o) { v = _mm_sub_epi32(v, o.v); return *this; } SseUint32& operator*=(SseUint32 o) { v = _mm_mullo_epi32(v, o.v); return *this; } SseUint32& operator&=(SseUint32 o) { v = _mm_and_si128(v, o.v); return *this; } SseUint32& operator|=(SseUint32 o) { v = _mm_or_si128(v, o.v); return *this; } // Shifts. SseUint32 operator<<(int n) const { return _mm_slli_epi32(v, n); } SseUint32 operator>>(int n) const { return _mm_srli_epi32(v, n); // Logical shift right } SseUint32& operator<<=(int n) { v = _mm_slli_epi32(v, n); return *this; } SseUint32& operator>>=(int n) { v = _mm_srli_epi32(v, n); return *this; } // Arithmetic (non-member friends for symmetric implicit conversions). friend SseUint32 operator+(SseUint32 a, SseUint32 b) { return _mm_add_epi32(a.v, b.v); } friend SseUint32 operator-(SseUint32 a, SseUint32 b) { return _mm_sub_epi32(a.v, b.v); } friend SseUint32 operator*(SseUint32 a, SseUint32 b) { return _mm_mullo_epi32(a.v, b.v); } // Bitwise. friend SseUint32 operator&(SseUint32 a, SseUint32 b) { return _mm_and_si128(a.v, b.v); } friend SseUint32 operator|(SseUint32 a, SseUint32 b) { return _mm_or_si128(a.v, b.v); } friend SseUint32 operator^(SseUint32 a, SseUint32 b) { return _mm_xor_si128(a.v, b.v); } friend SseUint32 operator~(SseUint32 a) { return _mm_xor_si128(a.v, _mm_set1_epi32(-1)); } // Unsigned comparisons need XOR with sign bit to convert to signed domain. friend SseUint32 operator==(SseUint32 a, SseUint32 b) { return _mm_cmpeq_epi32(a.v, b.v); } friend SseUint32 operator!=(SseUint32 a, SseUint32 b) { return _mm_xor_si128(_mm_cmpeq_epi32(a.v, b.v), _mm_set1_epi32(-1)); } friend SseUint32 operator>(SseUint32 a, SseUint32 b) { __m128i bias = _mm_set1_epi32(static_cast(0x80000000u)); return _mm_cmpgt_epi32(_mm_xor_si128(a.v, bias), _mm_xor_si128(b.v, bias)); } friend SseUint32 operator<(SseUint32 a, SseUint32 b) { return b > a; } friend SseUint32 operator!(SseUint32 a) { return _mm_cmpeq_epi32(a.v, _mm_setzero_si128()); } }; // SseFloat ↔ SseInt32 conversion. inline SseFloat::SseFloat(SseInt32 i) : v(_mm_cvtepi32_ps(i.v)) {} // SseInt32 ↔ SseUint32 reinterpret (no-op on __m128i). inline SseInt32::SseInt32(SseUint32 u) : v(u.v) {} // Map raw __m128 to SseFloat for SimdTypeFor. 
template <> struct SimdTypeFor<__m128> { using type = SseFloat; }; // --- bit_cast specializations --- template <> inline SseInt32 bit_cast(const SseFloat& f) noexcept { return _mm_castps_si128(f.v); } template <> inline SseUint32 bit_cast(const SseFloat& f) noexcept { return _mm_castps_si128(f.v); } template <> inline SseFloat bit_cast(const SseInt32& i) noexcept { return _mm_castsi128_ps(i.v); } template <> inline SseFloat bit_cast(const SseUint32& u) noexcept { return _mm_castsi128_ps(u.v); } template <> inline SseInt32 bit_cast(const SseUint32& u) noexcept { return u.v; } template <> inline SseUint32 bit_cast(const SseInt32& i) noexcept { return i.v; } // --- FloatTraits --- template <> struct FloatTraits { using IntType = SseInt32; using UintType = SseUint32; using BoolType = SseFloat; // Float comparison masks static constexpr uint32_t kOne = 0x3f800000; static constexpr float kMagic = 12582912.f; static constexpr bool kBoolIsMask = true; static DISPENSO_INLINE SseFloat sqrt(SseFloat x) { return _mm_sqrt_ps(x.v); } static DISPENSO_INLINE SseFloat rcp(SseFloat x) { return _mm_rcp_ps(x.v); } static DISPENSO_INLINE SseFloat fma(SseFloat a, SseFloat b, SseFloat c) { #if defined(__FMA__) return _mm_fmadd_ps(a.v, b.v, c.v); #else return _mm_add_ps(_mm_mul_ps(a.v, b.v), c.v); #endif } // conditional: select x where mask is true, y where false. template static DISPENSO_INLINE Arg conditional(SseFloat mask, Arg x, Arg y); template static DISPENSO_INLINE Arg conditional(SseInt32 mask, Arg x, Arg y); template static DISPENSO_INLINE Arg apply(SseFloat mask, Arg x); static DISPENSO_INLINE SseFloat min(SseFloat a, SseFloat b) { return _mm_min_ps(a.v, b.v); } static DISPENSO_INLINE SseFloat max(SseFloat a, SseFloat b) { return _mm_max_ps(a.v, b.v); } }; // conditional specializations. template <> inline SseFloat FloatTraits::conditional(SseFloat mask, SseFloat x, SseFloat y) { return _mm_blendv_ps(y.v, x.v, mask.v); } template <> inline SseInt32 FloatTraits::conditional(SseFloat mask, SseInt32 x, SseInt32 y) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(y.v), _mm_castsi128_ps(x.v), mask.v)); } template <> inline SseUint32 FloatTraits::conditional(SseFloat mask, SseUint32 x, SseUint32 y) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(y.v), _mm_castsi128_ps(x.v), mask.v)); } template <> inline SseFloat FloatTraits::conditional(SseInt32 mask, SseFloat x, SseFloat y) { return _mm_blendv_ps(y.v, x.v, _mm_castsi128_ps(mask.v)); } template <> inline SseInt32 FloatTraits::conditional(SseInt32 mask, SseInt32 x, SseInt32 y) { return _mm_castps_si128( _mm_blendv_ps(_mm_castsi128_ps(y.v), _mm_castsi128_ps(x.v), _mm_castsi128_ps(mask.v))); } template <> inline SseUint32 FloatTraits::conditional(SseInt32 mask, SseUint32 x, SseUint32 y) { return _mm_castps_si128( _mm_blendv_ps(_mm_castsi128_ps(y.v), _mm_castsi128_ps(x.v), _mm_castsi128_ps(mask.v))); } // apply: mask & x (bitwise AND). 
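// Illustrative example (a sketch): apply() zeroes the lanes where the mask is false,
// e.g.
//   SseFloat r = FloatTraits<SseFloat>::apply(x > SseFloat(0.f), x);
// keeps positive lanes of x and forces the remaining lanes to all-zero bits (+0.0f).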
template <> inline SseFloat FloatTraits::apply(SseFloat mask, SseFloat x) { return _mm_and_ps(mask.v, x.v); } template <> inline SseInt32 FloatTraits::apply(SseFloat mask, SseInt32 x) { return _mm_and_si128(_mm_castps_si128(mask.v), x.v); } template <> inline SseUint32 FloatTraits::apply(SseFloat mask, SseUint32 x) { return _mm_and_si128(_mm_castps_si128(mask.v), x.v); } template <> struct FloatTraits { using IntType = SseInt32; }; template <> struct FloatTraits { using IntType = SseUint32; }; // --- Util function overloads for SSE types --- DISPENSO_INLINE SseFloat floor_small(SseFloat x) { return _mm_floor_ps(x.v); } DISPENSO_INLINE SseInt32 convert_to_int_trunc(SseFloat f) { return _mm_cvttps_epi32(f.v); } DISPENSO_INLINE SseInt32 convert_to_int_trunc_safe(SseFloat f) { SseInt32 fi = bit_cast(f); SseInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & SseInt32(_mm_cvttps_epi32(f.v)); } DISPENSO_INLINE SseInt32 convert_to_int(SseFloat f) { // _mm_cvtps_epi32 uses round-to-nearest-even. // Mask non-normals to 0 to avoid undefined behavior. SseInt32 fi = bit_cast(f); SseInt32 norm = (fi & 0x7f800000) != 0x7f800000; return norm & SseInt32(_mm_cvtps_epi32(f.v)); } template <> DISPENSO_INLINE SseFloat min(SseFloat x, SseFloat mn) { // Ordering: if x is NaN, result is mn (second operand). return _mm_min_ps(x.v, mn.v); } template <> DISPENSO_INLINE SseFloat clamp_allow_nan(SseFloat x, SseFloat mn, SseFloat mx) { return _mm_max_ps(mn.v, _mm_min_ps(mx.v, x.v)); } template <> DISPENSO_INLINE SseFloat clamp_no_nan(SseFloat x, SseFloat mn, SseFloat mx) { return _mm_max_ps(mn.v, _mm_min_ps(x.v, mx.v)); } template <> DISPENSO_INLINE SseFloat gather(const float* table, SseInt32 index) { alignas(16) int32_t idx[4]; _mm_store_si128(reinterpret_cast<__m128i*>(idx), index.v); return _mm_set_ps(table[idx[3]], table[idx[2]], table[idx[1]], table[idx[0]]); } DISPENSO_INLINE SseInt32 int_div_by_3(SseInt32 i) { // Multiply each lane by 0x55555556 and take the high 32 bits. // Process even and odd lanes separately since _mm_mul_epu32 only uses lanes 0,2. __m128i multiplier = _mm_set1_epi32(0x55555556); // Even lanes (0, 2). __m128i even = _mm_srli_epi64(_mm_mul_epu32(i.v, multiplier), 32); // Odd lanes (1, 3): shift right by 32 bits to put odd lanes in even positions. __m128i i_odd = _mm_srli_epi64(i.v, 32); __m128i odd = _mm_srli_epi64(_mm_mul_epu32(i_odd, multiplier), 32); odd = _mm_slli_epi64(odd, 32); // Blend even and odd results. return _mm_blend_epi16(even, odd, 0xCC); // 0xCC = 11001100b selects odd from odd } // nonnormal/nonnormalOrZero: return SseInt32 masks for SSE types. DISPENSO_INLINE SseInt32 nonnormal(SseInt32 i) { return (i & 0x7f800000) == 0x7f800000; } DISPENSO_INLINE SseInt32 nonnormalOrZero(SseInt32 i) { auto m = i & 0x7f800000; return (m == 0x7f800000) | (m == 0); } DISPENSO_INLINE SseInt32 nonnormal(SseFloat f) { return nonnormal(bit_cast(f)); } // any_true: reduce SIMD mask to scalar bool (true if any lane is set). DISPENSO_INLINE bool any_true(SseInt32 mask) { return _mm_movemask_ps(_mm_castsi128_ps(mask.v)) != 0; } DISPENSO_INLINE SseFloat signof(SseFloat x) { SseUint32 xi = bit_cast(x); return bit_cast((xi & 0x80000000u) | FloatTraits::kOne); } DISPENSO_INLINE SseInt32 signofi(SseInt32 i) { return SseInt32(1) - (SseInt32(2) & (i < SseInt32(0))); } // nbool_as_one: 0 if mask is true (all-ones), 1 if false (all-zeros). 
template <> DISPENSO_INLINE SseFloat nbool_as_one(SseFloat b) { // ~mask & 1.0f bits return bit_cast( SseInt32(_mm_andnot_si128(_mm_castps_si128(b.v), _mm_set1_epi32(0x3f800000)))); } template <> DISPENSO_INLINE SseInt32 nbool_as_one(SseFloat b) { return _mm_andnot_si128(_mm_castps_si128(b.v), _mm_set1_epi32(1)); } template <> DISPENSO_INLINE SseInt32 nbool_as_one(SseInt32 b) { return _mm_andnot_si128(b.v, _mm_set1_epi32(1)); } // bool_as_one: 1 if mask is true, 0 if false. template <> DISPENSO_INLINE SseFloat bool_as_one(SseFloat b) { return bit_cast( SseInt32(_mm_and_si128(_mm_castps_si128(b.v), _mm_set1_epi32(0x3f800000)))); } template <> DISPENSO_INLINE SseInt32 bool_as_one(SseFloat b) { return _mm_and_si128(_mm_castps_si128(b.v), _mm_set1_epi32(1)); } // bool_as_mask: for SIMD masks, identity (already a mask). template <> DISPENSO_INLINE SseInt32 bool_as_mask(SseFloat b) { return _mm_castps_si128(b.v); } template <> DISPENSO_INLINE SseInt32 bool_as_mask(SseInt32 b) { return b; } template <> DISPENSO_INLINE SseInt32 bool_as_mask(SseUint32 b) { return b.v; } template <> DISPENSO_INLINE SseUint32 bool_as_mask(SseFloat b) { return _mm_castps_si128(b.v); } template <> DISPENSO_INLINE SseUint32 bool_as_mask(SseUint32 b) { return b; } template <> DISPENSO_INLINE SseUint32 bool_as_mask(SseInt32 b) { return b.v; } } // namespace fast_math } // namespace dispenso #endif // defined(__SSE4_1__) ================================================ FILE: dispenso/fast_math/simd.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ // Convenience header that provides DefaultSimdFloat — the widest available // SIMD float type for the current platform. Include this header and use // DefaultSimdFloat in generic code that should auto-vectorize to the best // available SIMD width. // // Example: // #include // using F = dispenso::fast_math::DefaultSimdFloat; // F result = dispenso::fast_math::sin(F(1.0f)); #pragma once // DefaultSimdFloat is defined in util.h (after SIMD backend includes). // This header just pulls in fast_math.h which includes util.h. #include ================================================ FILE: dispenso/fast_math/util.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #if defined(__SSE__) || defined(__AVX__) #include #endif // __SSE__ #include #include #include "float_traits.h" namespace dispenso { namespace fast_math { // Reinterpret the bits of `src` as type `To`. Equivalent to std::bit_cast (C++20). template DISPENSO_INLINE std::enable_if_t< sizeof(To) == sizeof(From) && std::is_trivially_copyable_v && std::is_trivially_copyable_v, To> bit_cast(const From& src) noexcept { static_assert( std::is_default_constructible_v, "This implementation additionally requires " "destination type to be default constructible"); To dst; memcpy(&dst, &src, sizeof(To)); return dst; } // Return the ULP distance between two floats. Denormals are treated as zero. // Handles mixed-sign comparisons correctly by mapping IEEE 754 bit patterns // to a linear integer representation where negative floats map to negative integers. 
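// Worked examples (illustrative): float_distance(1.0f, 1.0f + FLT_EPSILON) == 1,
// since 1.0f + FLT_EPSILON is the next representable float above 1.0f, and
// float_distance(-0.0f, +0.0f) == 0, because both signed zeros (like all denormals)
// map to the same point on the linear integer scale.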
DISPENSO_INLINE uint32_t float_distance(float a, float b) { uint32_t ai = bit_cast(a); uint32_t bi = bit_cast(b); // Handle denormal values as zero ai = ((ai & 0x7f800000) == 0) ? 0 : ai; bi = ((bi & 0x7f800000) == 0) ? 0 : bi; // Map to linear integer space: positive floats stay as-is, // negative floats map to negative integers (sign-magnitude → two's complement). auto toLinear = [](uint32_t bits) -> int32_t { return (bits & 0x80000000u) ? static_cast(0x80000000u - bits) : static_cast(bits); }; int32_t la = toLinear(ai); int32_t lb = toLinear(bi); int32_t diff = la - lb; return static_cast(diff < 0 ? -diff : diff); } // Absolute value: clears the sign bit. Works for all float/SIMD types. template DISPENSO_INLINE Flt fabs(Flt x) { if constexpr (!std::is_same_v>) { return fabs(SimdType_t(x)).v; } else { using Uint = UintType_t; return bit_cast(bit_cast(x) & Uint(0x7fffffff)); } } // Return +1.0 or -1.0 matching the sign bit of x. +0 gives +1, -0 gives -1. template DISPENSO_INLINE Flt signof(Flt x) { if constexpr (!std::is_same_v>) { return signof(SimdType_t(x)).v; } else { using Uint = UintType_t; return bit_cast((bit_cast(x) & 0x80000000) | FloatTraits::kOne); } } // Integer sign: returns +1 for i >= 0, -1 for i < 0. Scalar version (kBoolIsMask=false). template DISPENSO_INLINE std::enable_if_t::kBoolIsMask, IntType_t> signofi( IntType_t i) { return 1 - 2 * (i < 0); } // Integer sign: returns +1 for i >= 0, -1 for i < 0. SIMD mask version (kBoolIsMask=true). template DISPENSO_INLINE std::enable_if_t::kBoolIsMask, IntType_t> signofi( IntType_t i) { // Explicit IntType_t cast handles AVX-512 where (i < 0) returns Avx512Mask, not Avx512Int32. auto neg = IntType_t(i < 0); return 1 - (2 & neg); } // True if the float (as int bits) has all exponent bits set (inf or NaN). // Returns bool for scalar types, SIMD lane mask for SIMD types. template DISPENSO_INLINE auto nonnormal(IntType_t i) { return (i & 0x7f800000) == 0x7f800000; } // True if the float (as int bits) is inf, NaN, or zero. // Returns bool for scalar types, SIMD lane mask for SIMD types. template DISPENSO_INLINE auto nonnormalOrZero(IntType_t i) { auto m = i & 0x7f800000; return (m == 0x7f800000) | (m == 0); } template DISPENSO_INLINE auto nonnormal(Flt f) { return nonnormal(bit_cast>(f)); } // Truncate float toward zero, returning int. No inf/NaN guard — caller must // ensure inputs are finite (e.g. after range reduction by a finite constant). // Maps to cvttps2epi32 (SSE), vcvtq_s32_f32 (NEON), or (int)f (scalar). template DISPENSO_INLINE IntType_t convert_to_int_trunc(Flt f) { if constexpr (!std::is_same_v>) { return convert_to_int_trunc(SimdType_t(f)).v; } else { return static_cast>(f); } } // Truncate float toward zero, returning int. Inf/NaN lanes are zeroed. // Use when the input may contain non-finite values. template DISPENSO_INLINE IntType_t convert_to_int_trunc_safe(Flt f) { if constexpr (!std::is_same_v>) { return convert_to_int_trunc_safe(SimdType_t(f)).v; } else { auto fi = bit_cast>(f); if ((fi & 0x7f800000) == 0x7f800000) { return 0; } return static_cast>(f); } } // Convert float to int using round-to-nearest-even (SSE cvtss2si semantics). // For non-normal inputs (inf/NaN), returns 0 to avoid undefined behavior. template DISPENSO_INLINE IntType_t convert_to_int(Flt f) { if constexpr (!std::is_same_v>) { return convert_to_int(SimdType_t(f)).v; } else { #if defined(__SSE__) return _mm_cvtss_si32(_mm_set_ss(f)); #else // Round to nearest even via magic number addition, guard non-normals → 0. 
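// Worked example (illustrative): kMagic = 1.5f * 2^23 = 12582912.0f. Adding it moves
// f into [2^23, 2^24), where the float spacing is exactly 1, so the hardware's
// round-to-nearest-even performs the rounding: for f = 2.5f the exact sum 12582914.5
// is not representable and rounds to 12582914.0f (ties-to-even); subtracting kMagic
// leaves 2.0f, so convert_to_int(2.5f) == 2.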
auto fi = bit_cast>(f); if ((fi & 0x7f800000) == 0x7f800000) { return 0; } constexpr float kMagic = FloatTraits::kMagic; // 1.5f * 2^23 return static_cast>((f + kMagic) - kMagic); #endif // __SSE__ } } // Convert float to int with round-to-nearest-even, clamping to [kMin, kMax]. // For non-normal inputs (inf/NaN), returns 0 to avoid undefined behavior. template kMin, IntType_t kMax> DISPENSO_INLINE IntType_t convert_to_int_clamped(Flt f) { #if defined(__SSE4_1__) static const __m128i kMn = _mm_set1_epi32(kMin); static const __m128i kMx = _mm_set1_epi32(kMax); __m128i i = _mm_cvtps_epi32(_mm_set_ss(f)); i = _mm_min_epi32(i, kMx); i = _mm_max_epi32(i, kMn); return _mm_cvtsi128_si32(i); #else // Round to nearest even via magic number addition (same technique as rangeReduce), // then clamp. Guard against non-normal inputs (inf/NaN) → return 0. auto fi = bit_cast>(f); if ((fi & 0x7f800000) == 0x7f800000) { return 0; } constexpr float kMagic = FloatTraits::kMagic; // 1.5f * 2^23 auto rounded = static_cast>((f + kMagic) - kMagic); if (rounded > kMax) return kMax; if (rounded < kMin) return kMin; return rounded; #endif // __SSE4_1__ } // Floor for values within integer range. Uses SSE4.1 roundss when available. template DISPENSO_INLINE Flt floor_small(Flt x) { if constexpr (!std::is_same_v>) { return floor_small(SimdType_t(x)).v; } else { IntType_t i = x; Flt xi = i; if constexpr (FloatTraits::kBoolIsMask) { return i - ((x < xi) & 1); } else { return i - (x < xi) * 1; } } } template <> DISPENSO_INLINE float floor_small(float x) { #if defined(__SSE4_1__) __m128 f = _mm_set_ss(x); __m128 r = _mm_floor_ss(f, f); return _mm_cvtss_f32(r); #else return std::floor(x); #endif // __SSE4_1__ } // Minimum of x and mn. If x is NaN and mn is not, returns mn (relied-upon NaN behavior). template DISPENSO_INLINE Flt min(Flt x, Flt mn); template <> DISPENSO_INLINE float min(float x, float mn) { #if defined(__SSE4_1__) __m128 f = _mm_set_ss(x); __m128 fmn = _mm_set_ss(mn); // Ordering matters here for NaN behavior __m128 r = _mm_min_ss(f, fmn); return _mm_cvtss_f32(r); #else return x < mn ? x : mn; #endif //__SSE4_1__ } // Clamp x to [mn, mx]. If x is NaN, NaN propagates (result is NaN). // Use when NaN should signal an error rather than be silently clamped. template DISPENSO_INLINE Flt clamp_allow_nan(Flt x, Flt mn, Flt mx); template <> DISPENSO_INLINE float clamp_allow_nan(float x, float mn, float mx) { #if defined(__SSE4_1__) __m128 f = _mm_set_ss(x); __m128 fmn = _mm_set_ss(mn); __m128 fmx = _mm_set_ss(mx); // Ordering matters here for NaN behavior __m128 r = _mm_max_ss(fmn, _mm_min_ss(fmx, f)); return _mm_cvtss_f32(r); #else mx = (mx < x) ? mx : x; return (mx < mn) ? mn : mx; #endif // __SSE4_1__ } // Clamp x to [mn, mx]. If x is NaN, returns a value in [mn, mx] (NaN is suppressed). // Use when a valid output is required regardless of input. template DISPENSO_INLINE Flt clamp_no_nan(Flt x, Flt mn, Flt mx); template <> DISPENSO_INLINE float clamp_no_nan(float x, float mn, float mx) { #if defined(__SSE4_1__) __m128 f = _mm_set_ss(x); __m128 fmn = _mm_set_ss(mn); __m128 fmx = _mm_set_ss(mx); // Ordering matters here for NaN behavior __m128 r = _mm_max_ss(fmn, _mm_min_ss(f, fmx)); return _mm_cvtss_f32(r); // TODO: See vrndmq_f32 for ARM NEON #else mx = (mx > x) ? x : mx; return (mx > mn) ? mx : mn; #endif // __SSE4_1__ } // Load table[index]. Scalar version is a plain array access; SIMD versions use gather instructions. 
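// Illustrative sketch (hypothetical table name): gather pairs naturally with the
// integer conversions above for table-driven approximations, e.g.
//   auto idx = convert_to_int_trunc(clamp_no_nan(x, Flt(0.f), Flt(255.f)));
//   Flt y = gather<Flt>(lut, idx);  // lut: a caller-provided float[256]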
template DISPENSO_INLINE Flt gather(const float* table, IntType_t index); template <> DISPENSO_INLINE float gather(const float* table, int32_t index) { return table[index]; } // Return 0 if b is true, 1 if b is false (negated bool as numeric). template DISPENSO_INLINE T nbool_as_one(BoolT b); template <> DISPENSO_INLINE float nbool_as_one(bool b) { return b ? 0.0f : 1.0f; } template <> DISPENSO_INLINE int32_t nbool_as_one(bool b) { return static_cast(!b); } // Return 1 if b is true, 0 if b is false (bool as numeric). template DISPENSO_INLINE T bool_as_one(BoolT b); template <> DISPENSO_INLINE float bool_as_one(bool b) { return b ? 1.0f : 0.0f; } template <> DISPENSO_INLINE int32_t bool_as_one(bool b) { return static_cast(b); } // Convert bool to bitmask: true → all-ones (0xFFFFFFFF), false → 0x0. // For SIMD types, this converts a lane mask to an integer mask for bitwise ops. template DISPENSO_INLINE T bool_as_mask(BoolT b) { T mask = b; return ~(mask - 1); } // Return val if b is true, 0 otherwise. Uses bitwise AND via bool_as_mask. template DISPENSO_INLINE T bool_apply_or_zero(BoolT b, T val) { return bit_cast(bool_as_mask>(b) & bit_cast>(val)); } // True if x is an integer. Safe for all float values including large, inf, NaN. // All floats with |x| >= 2^23 are integers (mantissa has no fractional bits). // For |x| < 2^23, floor_small(x) is valid (no int overflow). template DISPENSO_INLINE BoolType_t float_is_int(Flt x) { auto ax = fabs(x); return (ax >= Flt(8388608.0f)) | (floor_small(x) == x); } // True if x is an odd integer. // For |x| >= 2^24: ULP >= 2, so only even integers are representable → always false. template DISPENSO_INLINE BoolType_t float_is_odd(Flt x) { if constexpr (std::is_same_v) { return float_is_int(x) && !float_is_int(x * 0.5f); } else { return float_is_int(x) & !float_is_int(x * Flt(0.5f)); } } // Fast integer division by 3 using multiply-and-shift. Used in cbrt magic constant computation. DISPENSO_INLINE int32_t int_div_by_3(int32_t i) { return static_cast((uint64_t(i) * 0x55555556) >> 32); } // --------------------------------------------------------------------------- // Polynomial evaluation: hornerEval, estrinEval, polyEval // --------------------------------------------------------------------------- // // All take coefficients in HIGH-to-LOW order (highest degree first): // polyEval(x, cn, cn-1, ..., c1, c0) = cn*x^n + cn-1*x^(n-1) + ... + c1*x + c0 // // This matches the natural Horner evaluation order and the existing hand-written // FMA chains in fast_math. For odd/even polynomial forms like tanh(x) = x*(1+x²*Q(x²)), // compose as: x * (1 + x2 * polyEval(x2, qn, ..., q1, q0)). namespace detail { // --- Horner evaluation --- // Sequential FMA chain, optimal for low-ILP targets (GPU, scalar, narrow SIMD). // Degree N polynomial = N FMAs. template DISPENSO_INLINE Flt hornerImpl(Flt, Flt accum) { return accum; } template DISPENSO_INLINE Flt hornerImpl(Flt x, Flt accum, Flt next, Cs... rest) { return hornerImpl(x, FloatTraits::fma(accum, x, next), rest...); } // --- Estrin evaluation --- // Tree-reduces paired coefficients at each level, cutting critical-path depth // from N to ceil(log2(N+1)) at the cost of extra multiplies for x powers. // Better for wide SIMD with high ILP (AVX, AVX-512). // // Generic peel-and-recurse via tuples. At each level: // 1. Pair adjacent values: (a,b) → fma(a, xp, b) // 2. Carry unpaired odd element // 3. 
Recurse with xp² and the paired results // // All tuple operations (make_tuple, tuple_cat, apply) are eliminated by the // optimizer — verified to produce identical assembly to hand-written FMA trees // with clang 21, GCC 11, and GCC 15 at -O2. template struct EstrinImpl { // Base cases DISPENSO_INLINE static Flt reduce(Flt, std::tuple done) { return std::get<0>(done); } DISPENSO_INLINE static Flt reduce(Flt xp, std::tuple done) { return FloatTraits::fma(std::get<0>(done), xp, std::get<1>(done)); } // General: unpack tuple, pair all elements at this level, recurse template DISPENSO_INLINE static Flt reduce(Flt xp, std::tuple done) { return std::apply([xp](auto... ds) { return pairLevel(xp, std::tuple<>{}, ds...); }, done); } // Done pairing at this level — recurse with xp² template DISPENSO_INLINE static Flt pairLevel(Flt xp, std::tuple paired) { return reduce(xp * xp, paired); } // Odd leftover — carry forward unpaired template DISPENSO_INLINE static Flt pairLevel(Flt xp, std::tuple paired, Flt a) { return reduce(xp * xp, std::tuple_cat(paired, std::make_tuple(a))); } // Peel two from front, pair via FMA, accumulate into paired tuple template DISPENSO_INLINE static Flt pairLevel(Flt xp, std::tuple paired, Flt a, Flt b, Rest... rest) { return pairLevel( xp, std::tuple_cat(paired, std::make_tuple(FloatTraits::fma(a, xp, b))), rest...); } }; } // namespace detail // Horner evaluation: hornerEval(x, cn, cn-1, ..., c0) = ((cn*x + cn-1)*x + ...)*x + c0 // Coefficients HIGH-to-LOW (highest degree first). // Flt is deduced from x; coefficients are converted to Flt internally. template DISPENSO_INLINE Flt hornerEval(Flt x, C0 cn, Cs... rest) { return detail::hornerImpl(x, Flt(cn), Flt(rest)...); } // Estrin evaluation: same semantics as hornerEval, but uses tree-reduction // for lower critical-path depth (ceil(log2(N)) vs N dependent FMAs). // Coefficients HIGH-to-LOW. template DISPENSO_INLINE Flt estrinEval(Flt x, C0 cn, Cs... rest) { return detail::EstrinImpl::reduce(x, std::make_tuple(Flt(cn), Flt(rest)...)); } // Platform-adaptive polynomial evaluation. // Uses Estrin on CPU (tree reduction for ILP on wide SIMD), Horner on GPU // (sequential FMA chain, no wasted registers on in-order pipelines). // Use polyEval only at call sites where Estrin has been measured beneficial. // Use hornerEval directly for accuracy-sensitive polynomials where Estrin's // different rounding order would regress ULP. // Coefficients HIGH-to-LOW: polyEval(x, cn, ..., c0). template DISPENSO_INLINE Flt polyEval(Flt x, C0 cn, Cs... rest) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return hornerEval(x, cn, rest...); #else return estrinEval(x, cn, rest...); #endif } } // namespace fast_math } // namespace dispenso // Auto-detect SIMD backends and include FloatTraits specializations. #if defined(__SSE4_1__) #include #endif #if defined(__AVX2__) #include #endif #if defined(__AVX512F__) #include #endif #if defined(__aarch64__) #include #endif #if __has_include("hwy/highway.h") #include #endif namespace dispenso { namespace fast_math { // Best available SIMD float type for the current platform. // Prefer native intrinsic wrappers over Highway for lower overhead. // Highway is a fallback for platforms without a native wrapper. 
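// Illustrative usage (values assumed): generic code written against DefaultSimdFloat
// picks up the widest backend available at compile time, e.g.
//   using F = dispenso::fast_math::DefaultSimdFloat;
//   F y = polyEval(F(x), 1.f / 6.f, 0.5f, 1.f, 1.f);  // x^3/6 + x^2/2 + x + 1
// evaluates the cubic with Estrin's scheme on CPU builds and Horner on device builds.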
#if defined(__aarch64__) using DefaultSimdFloat = NeonFloat; #elif defined(__AVX512F__) using DefaultSimdFloat = Avx512Float; #elif defined(__AVX2__) using DefaultSimdFloat = AvxFloat; #elif defined(__SSE4_1__) using DefaultSimdFloat = SseFloat; #elif __has_include("hwy/highway.h") using DefaultSimdFloat = HwyFloat; #else using DefaultSimdFloat = float; #endif } // namespace fast_math } // namespace dispenso ================================================ FILE: dispenso/for_each.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file for_each.h * @ingroup group_parallel * Functions for performing parallel for_each over iterables. This intends to more-or-less * mimic std::for_each, with a possible (Concurrent)TaskSet passed in for external wait capability, * and ForEachOptions for controlling the wait behavior and limiting of parallelism. **/ #pragma once #include #include #include #include #include namespace dispenso { #if DISPENSO_HAS_CONCEPTS /** * @concept ForEachFunc * @brief A callable suitable for for_each operations. * * The callable must be invocable with a reference to the iterator's value type. **/ template concept ForEachFunc = std::invocable())>; #endif // DISPENSO_HAS_CONCEPTS /** * A set of options to control for_each **/ struct ForEachOptions { /** * The maximum number of threads to use. This can be used to limit the number of threads below * the number associated with the TaskSet's thread pool to control the degree of concurrency. * Setting maxThreads to zero or one will result in serial operation. **/ uint32_t maxThreads = std::numeric_limits::max(); /** * Specify whether the return of the for_each signifies the work is complete. If the * for_each is initiated without providing a TaskSet, the for_each will always wait. * * @note If wait is true, the calling thread will always participate in computation. If this is * not desired, pass wait as false, and wait manually outside of the for_each on the passed * TaskSet. **/ bool wait = true; }; namespace detail { // Random-access iterators: compute chunk boundaries arithmetically, avoiding SmallVector. template void for_each_n_schedule( TaskSetT& tasks, Iter start, F&& f, ssize_t numThreads, size_t chunkSize, ssize_t transitionIdx, size_t smallChunkSize, const ForEachOptions& options, std::random_access_iterator_tag) { ssize_t numToSchedule = options.wait ? 
numThreads - 1 : numThreads; if (numToSchedule > 0) { tasks.scheduleBulk( static_cast(numToSchedule), [start, &f, chunkSize, smallChunkSize, transitionIdx](size_t idx) { ssize_t sidx = static_cast(idx); ssize_t offset; ssize_t thisChunkSize; if (sidx < transitionIdx) { offset = sidx * static_cast(chunkSize); thisChunkSize = static_cast(chunkSize); } else { offset = transitionIdx * static_cast(chunkSize) + (sidx - transitionIdx) * static_cast(smallChunkSize); thisChunkSize = static_cast(smallChunkSize); } Iter s = start + offset; Iter e = s + thisChunkSize; return [s, e, f]() { auto recurseInfo = PerPoolPerThreadInfo::parForRecurse(); for (Iter it = s; it != e; ++it) { f(*it); } }; }); } if (options.wait) { ssize_t lastIdx = numThreads - 1; ssize_t offset; ssize_t thisChunkSize; if (lastIdx < transitionIdx) { offset = lastIdx * static_cast(chunkSize); thisChunkSize = static_cast(chunkSize); } else { offset = transitionIdx * static_cast(chunkSize) + (lastIdx - transitionIdx) * static_cast(smallChunkSize); thisChunkSize = static_cast(smallChunkSize); } Iter lastStart = start + offset; Iter lastEnd = lastStart + thisChunkSize; for (Iter it = lastStart; it != lastEnd; ++it) { f(*it); } tasks.wait(); } } // Non-random-access iterators: pre-compute boundary iterators into SmallVector. template void for_each_n_schedule( TaskSetT& tasks, Iter start, F&& f, ssize_t numThreads, size_t chunkSize, ssize_t transitionIdx, size_t smallChunkSize, const ForEachOptions& options, IterCategory) { SmallVector boundaries; boundaries.reserve(static_cast(numThreads) + 1); boundaries.push_back(start); for (ssize_t t = 0; t < numThreads; ++t) { size_t cs = (t < transitionIdx) ? chunkSize : smallChunkSize; Iter next = boundaries[t]; std::advance(next, static_cast(cs)); boundaries.push_back(next); } ssize_t numToSchedule = options.wait ? numThreads - 1 : numThreads; if (numToSchedule > 0) { tasks.scheduleBulk(static_cast(numToSchedule), [&boundaries, &f](size_t idx) { return [s = boundaries[idx], e = boundaries[idx + 1], f]() { auto recurseInfo = PerPoolPerThreadInfo::parForRecurse(); for (Iter it = s; it != e; ++it) { f(*it); } }; }); } if (options.wait) { for (Iter it = boundaries[numThreads - 1]; it != boundaries[numThreads]; ++it) { f(*it); } tasks.wait(); } } } // namespace detail /** * A function like std::for_each_n, but where the function is invoked in parallel across the passed * range. * * @param tasks The task set to schedule the for_each on. * @param start The iterator for the start of the range. * @param n The length of the range. * @param f The function to execute in parallel. This is a unary function that must be capable of * taking dereference of Iter. * @param options See ForEachOptions for details. **/ template DISPENSO_REQUIRES(ForEachFunc) void for_each_n(TaskSetT& tasks, Iter start, size_t n, F&& f, ForEachOptions options = {}) { // TODO(bbudge): With options.maxThreads, we might want to allow a small fanout factor in // recursive case? if (!n || !options.maxThreads || detail::PerPoolPerThreadInfo::isParForRecursive(&tasks.pool())) { for (size_t i = 0; i < n; ++i) { f(*start); ++start; } if (options.wait) { tasks.wait(); } return; } // 0 indicates serial execution per API spec int32_t maxThreads = std::max(options.maxThreads, 1); ssize_t numThreads = std::min(tasks.numPoolThreads() + options.wait, maxThreads); // Reduce threads used if they exceed work to be done. 
numThreads = std::min(numThreads, n); auto chunking = detail::staticChunkSize(n, numThreads); size_t chunkSize = chunking.ceilChunkSize; bool perfectlyChunked = chunking.transitionTaskIndex == numThreads; ssize_t transitionIdx = chunking.transitionTaskIndex; size_t smallChunkSize = chunkSize - !perfectlyChunked; detail::for_each_n_schedule( tasks, start, std::forward(f), numThreads, chunkSize, transitionIdx, smallChunkSize, options, typename std::iterator_traits::iterator_category{}); } /** * A function like std::for_each_n, but where the function is invoked in parallel across the passed * range. * * @param start The iterator for the start of the range. * @param n The length of the range. * @param f The function to execute in parallel. This is a unary function that must be capable of * taking dereference of Iter. * @param options See ForEachOptions for details; however it should be noted that this function must * always wait, and therefore options.wait is ignored. **/ template DISPENSO_REQUIRES(ForEachFunc) void for_each_n(Iter start, size_t n, F&& f, ForEachOptions options = {}) { TaskSet taskSet(globalThreadPool()); options.wait = true; for_each_n(taskSet, start, n, std::forward(f), options); } /** * A function like std::for_each, but where the function is invoked in parallel across the passed * range. * * @param tasks The task set to schedule the for_each on. * @param start The iterator for the start of the range. * @param end The iterator for the end of the range. * @param f The function to execute in parallel. This is a unary function that must be capable of * taking dereference of Iter. * @param options See ForEachOptions for details. **/ template DISPENSO_REQUIRES(ForEachFunc) void for_each(TaskSetT& tasks, Iter start, Iter end, F&& f, ForEachOptions options = {}) { for_each_n(tasks, start, std::distance(start, end), std::forward(f), options); } /** * A function like std::for_each, but where the function is invoked in parallel across the passed * range. * * @param start The iterator for the start of the range. * @param end The iterator for the end of the range. * @param f The function to execute in parallel. This is a unary function that must be capable of * taking dereference of Iter. * @param options See ForEachOptions for details; however it should be noted that this function must * always wait, and therefore options.wait is ignored. **/ template DISPENSO_REQUIRES(ForEachFunc) void for_each(Iter start, Iter end, F&& f, ForEachOptions options = {}) { for_each_n(start, std::distance(start, end), std::forward(f), options); } // TODO(bbudge): Implement ranges versions for these in C++20 (currently not in an env where this // can be tested) } // namespace dispenso ================================================ FILE: dispenso/future.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file future.h * @ingroup group_async * A file providing Futures, as well as functionality for creating futures, and for how to invoke * Futures' closures. * * See https://en.cppreference.com/w/cpp/experimental/future for details on the API. **/ #pragma once #include #include #include #include #include namespace dispenso { // See https://en.cppreference.com/w/cpp/experimental/future for details on the API. // TODO(bbudge): Implement when_any(), ?unwrapping constructor? functionality. 
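// Illustrative usage sketch (an addition to this header's commentary, not original code):
// dispenso::async runs a callable via the global dispenso thread pool and returns a Future;
// then() chains a continuation, and get() blocks for the result, work-stealing while it waits.
//
// ~~~
// dispenso::Future<int> fut = dispenso::async([]() { return 6 * 7; });
// dispenso::Future<int> doubled =
//     fut.then([](dispenso::Future<int>&& f) { return f.get() * 2; });
// int result = doubled.get(); // 84
// ~~~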
/** * A std::launch policy specifying we won't force asynchronicity. Opposite of * std::launch::async **/ constexpr std::launch kNotAsync = static_cast(0); /** * A std::launch policy specifying we won't allow Future::wait_for and * Future::wait_util to invoke the Future's underlying functor. Opposite of * std::launch::deferred **/ constexpr std::launch kNotDeferred = static_cast(0); /** * A class that implements a hybrid of the interfaces for std::experimental::future, and * std::experimental::shared_future. Future acts like a shared_future, * but includes the share function in order to enable generic code that may use the * function. See https://en.cppreference.com/w/cpp/experimental/future for more details on the API, * but some differences are notable. * * 1. dispenso::Future are created and set into executors through their constructor, or * through dispenso::async * 2. dispenso::future can be shared around like * std::experimental::shared_future, and wait/get functions may be called * concurrently from multiple threads. * * Future is thread-safe to call into, with the usual caveats (one thread may not be calling * anything on the Future while another thread assigns to it or destructs it). It is thread-safe to * assign or destruct on one copy of a Future while making calls on another copy of a Future * with the same backing state. * * Because Future is designed to work well with the dispenso TaskSet and ThreadPool, Future * aggressively work steals in get and wait functions. This prevents * deadlock due to thread pool resource starvation, similar to how TaskSets avoid that flavor of * deadlock. It is important to note that deadlock is still possible due to traditional cyclic * dependency, e.g. Future A and Future B which wait on each other. Just as with a cyclic mutex * locking requirement, cyclic Future waits are considered programmer error. **/ template class Future : detail::FutureBase { using Base = detail::FutureBase; public: /** * Construct an invalid Future. **/ Future() noexcept : Base() {} /** * Move constructor * * @param f The future to move from. **/ Future(Future&& f) noexcept : Base(std::move(f)) {} /** @copydoc Future(Future&&) */ Future(Base&& f) noexcept : Base(std::move(f)) {} /** * Copy construct a Future. * * @param f The existing future to reference. This essentially increments the reference count on * the future's backing state. **/ Future(const Future& f) noexcept : Base(f) {} /** @copydoc Future(const Future&) */ Future(const Base& f) noexcept : Base(f) {} /** * Construct a Future with callable and a schedulable (e.g. a ThreadPool or TaskSet), and ensure * it is scheduled according to the launch policy. * * @param f a functor with signature Result(void) to be invoked in order for * get to return a valid value. * @param schedulable A TaskSet, ThreadPool, or similar type that has * function schedule that takes a functor with signature void() and * ForceQueuingTag. * @param asyncPolicy If std::launch::async, the functor will be scheduled through to * the underlying thread pool work queue. * @param deferredPolicy If std::launch::deferred, wait_for and * wait_until may invoke the functor from their calling thread. **/ template Future( F&& f, Schedulable& schedulable, std::launch asyncPolicy = kNotAsync, std::launch deferredPolicy = std::launch::deferred) : Base(std::forward(f), schedulable, asyncPolicy, deferredPolicy) {} /** * Move a Future * * @param f The Future whose backing state will be transferred to this Future. 
**/ Future& operator=(Future&& f) noexcept { Base::move(reinterpret_cast(f)); return *this; } /** * Copy a Future, which increments the underlying state reference count. * * @param f The Future whose backing state will be referenced by this Future. **/ Future& operator=(const Future& f) { Base::copy(f); return *this; } /** * Destruct a Future. This decrements the shared reference count (if any), and ensures the * backing state is destroyed when no more references exist to the backing state. **/ ~Future() = default; /** * Is this Future valid? * * @return true if the Future was constructed with a functor/schedulable, with result * value, or indirectly from a Future that was constructed one of those ways. false * if the Future was constructed via the default constructor, or indrectly from a Future that was * constructed that way. **/ bool valid() const noexcept { return Base::valid(); } /** * Is the Future ready? * * @return true if the value associated with this Future has already be computed, and * get can return the value immediately. Returns false if the functor * is logically queued, or is in progress. **/ bool is_ready() const { return Base::is_ready(); } /** * Wait until is_ready is true. * * @note This function will invoke the functor if it is still logically queued. **/ void wait() const { Base::wait(); } /** * Wait until is_ready is true or until timing out. * * @param timeoutDuration The length of time to wait until timing out. This is relative to the * current point in time. * @return std::future_status::ready if get can return immediately, * std::future_status::timeout if the function timed out while waiting. * * @note This function may invoke the functor if it is still logically queued, and * std::launch::deferred was specified at construction. **/ template std::future_status wait_for(const std::chrono::duration& timeoutDuration) const { return Base::wait_for(timeoutDuration); } /** * Wait until is_ready is true or until timing out. * * @param timeoutTime The absolute time to wait until timing out. * @return std::future_status::ready if get can return immediately, * std::future_status::timeout if the function timed out while waiting. * * @note This function may invoke the functor if it is still logically queued, and * std::launch::deferred was specified at construction. **/ template std::future_status wait_until(const std::chrono::time_point& timeoutTime) const { return Base::wait_until(timeoutTime); } /** * Provide a shared future. share is here only to provide compatible api with * std::experimental::future, but Future already works like std::shared_future. **/ Future share() { return std::move(*this); } /** * Get the result of the future functor. This function blocks until the result is ready. * * @return A const reference to the result's value. **/ const Result& get() const { wait(); return this->impl_->result(); } /** * Schedule a functor to be invoked upon reaching is_ready status, and return * a future that will hold the result of the functor. * * @param f The functor to be executed whose result will be available in the returned * Future. This should have signature Unpecified(Future&&). * @param sched The Schedulable in which to run the functor. * @param asyncPolicy if std::launch::async then the functor will attempt to be * queued on the backing work queue of the Schedulable (if any). * @param deferredPolicy if std::launch::deferred, wait_for and * wait_until of the returned future will be allowed to invoke the functor, otherwise * not. 
* * @return A future containing the result of the functor. **/ template Future&&>> then( F&& f, Schedulable& sched, std::launch asyncPolicy = kNotAsync, std::launch deferredPolicy = std::launch::deferred) { Future&&>> retFuture; retFuture.impl_ = this->template thenImpl&&>>( std::forward(f), sched, asyncPolicy, deferredPolicy); return retFuture; } template Future&&>> then(F&& f) { return then(std::forward(f), globalThreadPool(), kNotAsync, std::launch::deferred); } private: template Future(T&& t, detail::ReadyTag) { this->impl_ = detail::createValueFutureImplReady(std::forward(t)); } template friend Future> make_ready_future(T&& t); template friend class Future; }; /** * @copydoc Future */ template class Future : detail::FutureBase { using Base = detail::FutureBase; public: /** Default constructor. */ Future() noexcept : Base() {} /** Move constructor. */ Future(Future&& f) noexcept : Base(std::move(f)) {} /** @copydoc Future(Future&&) */ Future(Base&& f) noexcept : Base(std::move(f)) {} /** Copy constructor. */ Future(const Future& f) noexcept : Base(f) {} /** @copydoc Future(const Future&) */ Future(const Base& f) noexcept : Base(f) {} /** Construct with callable and schedulable. @see Future::Future(F&&, Schedulable&, * std::launch, std::launch) */ template Future( F&& f, Schedulable& schedulable, std::launch asyncPolicy = kNotAsync, std::launch deferredPolicy = std::launch::deferred) : Base(std::forward(f), schedulable, asyncPolicy, deferredPolicy) {} Future& operator=(Future&& f) noexcept { Base::move(reinterpret_cast(f)); return *this; } Future& operator=(const Future& f) { Base::copy(f); return *this; } ~Future() = default; using Base::is_ready; using Base::valid; using Base::wait; using Base::wait_for; using Base::wait_until; Future share() { return std::move(*this); } /** * Get the result of the future functor. This function blocks until the result is ready. * * @return Access to the underlying reference. **/ Result& get() const { wait(); return this->impl_->result(); } template Future&&>> then( F&& f, Schedulable& sched, std::launch asyncPolicy = kNotAsync, std::launch deferredPolicy = std::launch::deferred) { Future&&>> retFuture; retFuture.impl_ = this->template thenImpl&&>>( std::forward(f), sched, asyncPolicy, deferredPolicy); return retFuture; } template Future&&>> then(F&& f) { return then(std::forward(f), globalThreadPool(), kNotAsync, std::launch::deferred); } private: template Future(std::reference_wrapper t, detail::ReadyTag) { this->impl_ = detail::createRefFutureImplReady(t); } template friend Future make_ready_future(std::reference_wrapper x); template friend class Future; }; /** * @copydoc Future */ template <> class Future : detail::FutureBase { using Base = detail::FutureBase; public: /** Default constructor. */ Future() noexcept : Base() {} /** Move constructor. */ Future(Future&& f) noexcept : Base(std::move(f)) {} /** @copydoc Future(Future&&) */ Future(Base&& f) noexcept : Base(std::move(f)) {} /** Copy constructor. */ Future(const Future& f) noexcept : Base(f) {} /** @copydoc Future(const Future&) */ Future(const Base& f) noexcept : Base(f) {} /** Construct with callable and schedulable. 
@see Future::Future(F&&, Schedulable&, * std::launch, std::launch) */ template DISPENSO_REQUIRES(OnceCallableFunc) Future( F&& f, Schedulable& schedulable, std::launch asyncPolicy = kNotAsync, std::launch deferredPolicy = std::launch::deferred) : Base(std::forward(f), schedulable, asyncPolicy, deferredPolicy) {} Future& operator=(Future&& f) noexcept { Base::move(reinterpret_cast(f)); return *this; } Future& operator=(const Future& f) { Base::copy(f); return *this; } ~Future() = default; using Base::is_ready; using Base::valid; using Base::wait; using Base::wait_for; using Base::wait_until; Future share() { return std::move(*this); } /** * Block until the functor has been called. **/ void get() const { wait(); this->impl_->result(); } template Future&&>> then( F&& f, Schedulable& sched, std::launch asyncPolicy = kNotAsync, std::launch deferredPolicy = std::launch::deferred) { Future&&>> retFuture; retFuture.impl_ = this->template thenImpl&&>>( std::forward(f), sched, asyncPolicy, deferredPolicy); return retFuture; } template Future&&>> then(F&& f) { return then(std::forward(f), globalThreadPool(), kNotAsync, std::launch::deferred); } private: Future(detail::ReadyTag) { impl_ = detail::createVoidFutureImplReady(); } friend Future make_ready_future(); template friend class Future; }; // TODO(bbudge): Determine if we should // a. Expand launch policies, and logically inherit from std::launch and // b. Whether async should truly mean on a new thread. // For now we will treat std::launch::async such that we pass ForceQueuingTag /** * Invoke a functor through the global dispenso thread pool. * * @param policy The bitmask policy for when/how the functor can be invoked. * std::launch::async will result in the functor being forced onto a ThreadPool work * queue. std::launch::deferred specifies that Future::wait_for and * Future::wait_until may invoke the functor. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(std::launch policy, F&& f, Args&&... args) { return Future>( std::bind(std::forward(f), std::forward(args)...), globalThreadPool(), policy); } /** * Invoke a functor through the global dispenso thread pool. * * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(F&& f, Args&&... args) { return ::dispenso::async(std::launch::deferred, std::forward(f), std::forward(args)...); } /** * Invoke a functor through the specified dispenso thread pool. * * @param pool The ThreadPool to run the Future. * @param policy The bitmask policy for when/how the functor can be invoked. * std::launch::async will result in the functor being forced onto a ThreadPool work * queue. std::launch::deferred specifies that Future::wait_for and * Future::wait_until may invoke the functor. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(ThreadPool& pool, std::launch policy, F&& f, Args&&... args) { return Future>( std::bind(std::forward(f), std::forward(args)...), pool, policy); } /** * Invoke a functor through the specified dispenso thread pool. * * @param pool The ThreadPool to run the Future. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(ThreadPool& pool, F&& f, Args&&... 
args) { return ::dispenso::async( pool, std::launch::deferred, std::forward(f), std::forward(args)...); } /** * Invoke a functor through the specified dispenso TaskSet. * * @param tasks The TaskSet to run the Future. * @param policy The bitmask policy for when/how the functor can be invoked. * std::launch::async will result in the functor being forced onto a ThreadPool work * queue. std::launch::deferred specifies that Future::wait_for and * Future::wait_until may invoke the functor. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(TaskSet& tasks, std::launch policy, F&& f, Args&&... args) { return Future>( std::bind(std::forward(f), std::forward(args)...), tasks, policy); } /** * Invoke a functor through the specified dispenso TaskSet. * * @param tasks The TaskSet to run the Future. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(TaskSet& tasks, F&& f, Args&&... args) { return ::dispenso::async( tasks, std::launch::deferred, std::forward(f), std::forward(args)...); } /** * Invoke a functor through the specified dispenso ConcurrentTaskSet. * * @param tasks The ConcurrentTaskSet to run the Future. * @param policy The bitmask policy for when/how the functor can be invoked. * std::launch::async will result in the functor being forced onto a ThreadPool work * queue. std::launch::deferred specifies that Future::wait_for and * Future::wait_until may invoke the functor. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(ConcurrentTaskSet& tasks, std::launch policy, F&& f, Args&&... args) { return Future>( std::bind(std::forward(f), std::forward(args)...), tasks, policy); } /** * Invoke a functor through the specified dispenso ConcurrentTaskSet. * * @param tasks The ConcurrentTaskSet to run the Future. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(ConcurrentTaskSet& tasks, F&& f, Args&&... args) { return ::dispenso::async( tasks, std::launch::deferred, std::forward(f), std::forward(args)...); } /** * Invoke a functor on a new thread. * * @param sched A NewThreadInvoker * @param policy The bitmask policy for when/how the functor can be invoked. * std::launch::async will result in the functor being forced onto a ThreadPool work * queue. std::launch::deferred specifies that Future::wait_for and * Future::wait_until may invoke the functor. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(NewThreadInvoker sched, std::launch policy, F&& f, Args&&... args) { return Future>( std::bind(std::forward(f), std::forward(args)...), sched, policy); } /** * Invoke a functor on a new thread. * * @param sched A NewThreadInvoker. * @param f The functor to be passed, or a function to be executed * @param args The remaining arguments that will be passed to f **/ template inline Future> async(NewThreadInvoker sched, F&& f, Args&&... 
args) { return ::dispenso::async( sched, std::launch::deferred, std::forward(f), std::forward(args)...); } /** * Make a Future in a ready state with the value passed into * make_ready_future * * @param t the value to use to create the returned future. **/ template inline Future> make_ready_future(T&& t) { return Future>(std::forward(t), detail::ReadyTag()); } /** * Make a Future in a ready state with the reference_wrapper passed into * make_ready_future * * @param x the wrapped reference to use to create the returned future. **/ template inline Future make_ready_future(std::reference_wrapper x) { return Future(x, detail::ReadyTag()); } /** * Make a Future in a ready state. * **/ inline Future make_ready_future() { return Future(detail::ReadyTag()); } /** * Take a collection of futures, and return a future which will be ready when all input futures are * ready. * * @param first An iterator to the start of the future collection. * @param last An iterator to the end of the future collection. * * @return A Future containing a vector holding copies of the input Futures. The returned Future * will be in ready state when all input Futures are ready. * **/ template Future::value_type>> when_all( InputIt first, InputIt last); /** * Take a specific set of futures, and return a future which will be ready when all input futures *are ready. * * @param futures A parameter pack of futures. * * @return A Future containing a tuple holding copies of the input Futures. The returned Future * will be in ready state when all input Futures are ready. * **/ template auto when_all(Futures&&... futures) -> Future...>>; /** * Take a collection of futures, and return a future which will be ready when all input futures are * ready. * * @param taskSet A task set to register with such that after this call, * taskSet::wait() implies that the resultant future is_ready() * @param first An iterator to the start of the future collection. * @param last An iterator to the end of the future collection. * * @return A Future containing a vector holding copies of the input Futures. The returned Future * will be in ready state when all input Futures are ready. * **/ template Future::value_type>> when_all(TaskSet& taskSet, InputIt first, InputIt last); /** @overload */ template Future::value_type>> when_all(ConcurrentTaskSet& taskSet, InputIt first, InputIt last); /** * Take a specific set of futures, and return a future which will be ready when all input futures *are ready. * * @param taskSet A task set to register with such that after this call, * taskSet::wait() implies that the resultant future is_ready() * @param futures A parameter pack of futures. * * @return A Future containing a tuple holding copies of the input Futures. The returned Future * will be in ready state when all input Futures are ready. * **/ template auto when_all(TaskSet& taskSet, Futures&&... futures) -> Future...>>; /** @overload */ template auto when_all(ConcurrentTaskSet& taskSet, Futures&&... futures) -> Future...>>; } // namespace dispenso #include ================================================ FILE: dispenso/graph.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace { constexpr size_t kToDelete = std::numeric_limits::max(); void set_union( std::vector& s1, const std::vector& s2) { std::vector tmp(s1); s1.clear(); std::set_union(tmp.cbegin(), tmp.cend(), s2.cbegin(), s2.cend(), std::back_inserter(s1)); } void set_insert(std::vector& s, const dispenso::BiPropNode* node) { auto it = std::upper_bound(s.begin(), s.end(), node); if (it == s.begin() || *(it - 1) != node) { s.insert(it, node); } } } // anonymous namespace namespace dispenso { void BiPropNode::biPropDependsOnOneNode(BiPropNode& node) { Node::dependsOnOneNode(node); if (node.biPropSet_ == nullptr && biPropSet_ == nullptr) { biPropSet_ = std::make_shared>(); set_insert(*biPropSet_, this); set_insert(*biPropSet_, &node); node.biPropSet_ = biPropSet_; } else if (node.biPropSet_ != nullptr && biPropSet_ != nullptr) { set_union(*biPropSet_, *node.biPropSet_); node.biPropSet_ = biPropSet_; } else if (biPropSet_ == nullptr) { biPropSet_ = node.biPropSet_; set_insert(*biPropSet_, this); } else { node.biPropSet_ = biPropSet_; set_insert(*biPropSet_, &node); } } template void SubgraphT::clear() { decrementDependentCounters(); const size_t numGraphPredecessors = markNodesWithPredicessors(); if (numGraphPredecessors != 0) { removePredecessorDependencies(numGraphPredecessors); } destroyNodes(); } template void SubgraphT::destroyNodes() { for (NodeType* n : nodes_) { n->~NodeType(); } allocator_->clear(); nodes_.clear(); } template SubgraphT::~SubgraphT() { for (NodeType* n : nodes_) { n->~NodeType(); } } template void SubgraphT::decrementDependentCounters() { for (N* node : nodes_) { for (Node* const dependent : node->dependents_) { dependent->numPredecessors_--; } removeNodeFromBiPropSet(node); } } template size_t SubgraphT::markNodesWithPredicessors() { size_t numGraphPredecessors = 0; for (N* node : nodes_) { if (node->numPredecessors_ != 0) { numGraphPredecessors += node->numPredecessors_; node->numPredecessors_ = kToDelete; } } return numGraphPredecessors; } template void SubgraphT::removePredecessorDependencies(size_t numGraphPredecessors) { for (SubgraphT& subgraph : graph_->subgraphs_) { if (&subgraph == this) { continue; } for (N* node : subgraph.nodes_) { auto& dependents = node->dependents_; size_t num = dependents.size(); for (size_t i = 0; i < num;) { if (dependents[i]->numPredecessors_ == kToDelete) { dependents[i] = dependents[num - 1]; --num; if (--numGraphPredecessors == 0) { dependents.resize(num); return; } } else { i++; } } dependents.resize(num); } } } namespace { constexpr size_t kMaxCache = 8; // Don't cache too-large allocators. This way we will have at most 8*(2**16) = 512K outstanding // nodes worth of memory per node type. // TODO(bbudge): Make these caching values macro configurable for lightweight platforms. 
constexpr size_t kMaxChunkCapacity = 1 << 16; using AlignedNodePoolPtr = std::unique_ptr>; std::vector g_sgcache[2]; std::mutex g_sgcacheMtx; template constexpr size_t kCacheIndex = size_t{std::is_same::value}; } // namespace template typename SubgraphT::PoolPtr SubgraphT::getAllocator() { AlignedNodePoolPtr ptr; auto& cache = g_sgcache[kCacheIndex]; { std::lock_guard lk(g_sgcacheMtx); if (cache.empty()) { void* alloc = detail::alignedMalloc(sizeof(NoLockPoolAllocator), alignof(NoLockPoolAllocator)); auto* pool = new (alloc) NoLockPoolAllocator(sizeof(NodeType), 128 * sizeof(NodeType), ::malloc, ::free); ptr.reset(pool); } else { ptr = std::move(cache.back()); ptr->clear(); cache.pop_back(); } } return PoolPtr(ptr.release(), releaseAllocator); } template void SubgraphT::releaseAllocator(NoLockPoolAllocator* ptr) { if (!ptr) { return; } if (ptr->totalChunkCapacity() < kMaxChunkCapacity) { auto& cache = g_sgcache[kCacheIndex]; { std::lock_guard lk(g_sgcacheMtx); if (cache.size() < kMaxCache) { cache.emplace_back(ptr); return; } } } detail::AlignedFreeDeleter()(ptr); } template GraphT::GraphT(GraphT&& other) : subgraphs_(std::move(other.subgraphs_)) { for (SubgraphT& subgraph : subgraphs_) { subgraph.graph_ = this; } } template GraphT& GraphT::operator=(GraphT&& other) noexcept { subgraphs_ = std::move(other.subgraphs_); for (SubgraphT& subgraph : subgraphs_) { subgraph.graph_ = this; } return *this; } template SubgraphT& GraphT::addSubgraph() { subgraphs_.push_back(SubgraphType(this)); return subgraphs_.back(); } template class DISPENSO_DLL_ACCESS SubgraphT; template class DISPENSO_DLL_ACCESS SubgraphT; template class DISPENSO_DLL_ACCESS GraphT; template class DISPENSO_DLL_ACCESS GraphT; } // namespace dispenso ================================================ FILE: dispenso/graph.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file graph.h * @ingroup group_graph * A file providing task graph functionality for expressing complex task dependencies. **/ #pragma once #include #include #include #include #include #include #include #include #include #include #include /** * @def DISPENSO_GRAPH_DEPENDENTS_INLINE_SIZE * @brief Number of dependent node pointers to store inline in each Node. * * Most graph nodes have few dependents (1-4 is common). This value controls * how many dependent pointers are stored inline before heap allocation. * Larger values reduce heap allocations but increase Node size. * Define before including graph.h to customize. **/ #ifndef DISPENSO_GRAPH_DEPENDENTS_INLINE_SIZE #define DISPENSO_GRAPH_DEPENDENTS_INLINE_SIZE 4 #endif static_assert( DISPENSO_GRAPH_DEPENDENTS_INLINE_SIZE >= 1 && DISPENSO_GRAPH_DEPENDENTS_INLINE_SIZE <= 64, "DISPENSO_GRAPH_DEPENDENTS_INLINE_SIZE must be between 1 and 64"); /* Terminology -------------------------------------------------------------------------------- The Node depends on the predecessor. The dependent depends on the node. ~~~ ┌─────────────┐ ┌──────┐ ┌───────────┐ │ predecessor │ ──▶ │ node │ ──▶ │ dependent │ └─────────────┘ └──────┘ └───────────┘ ~~~ Graph construction -------------------------------------------------------------------------------- The Graph class can be used to created tasks with dependencies and execute it once. Graphs must not contain cycles. 
Example: ~~~ // // ┌────────────┐ ┌───────────────┐ ┌───────────────────┐ // │ 0: r[0]+=1 │ ──▶ │ 1: r[1]=r[0]*2│ ──▶ │ 4: r[4]=r[1]+r[3] │ // └────────────┘ └───────────────┘ └───────────────────┘ // ▲ // ┌────────────┐ ┌───────────────┐ │ // │ 2: r[2]+=8 │ ──▶ │ 3: r[3]=r[2]/2│ ──────┘ // └────────────┘ └───────────────┘ std::array r; dispenso::Graph graph; dispenso::Node& N0 = graph.addNode([&]() { r[0] += 1; }); dispenso::Node& N2 = graph.addNode([&]() { r[2] += 8; }); dispenso::Node& N1 = graph.addNode([&]() { r[1] += r[0] * 2; }); dispenso::Node& N3 = graph.addNode([&]() { r[3] += r[2] / 2; }); dispenso::Node& N4 = graph.addNode([&]() { r[4] += r[1] + r[3]; }); N4.dependsOn(N1, N3); N1.dependsOn(N0); N3.dependsOn(N2); dispenso::TaskSet taskSet(dispenso::globalThreadPool()); dispenso::ParallelForExecutor parallelForExecutor; parallelForExecutor(taskSet, graph); ~~~ Partial re-evaluation -------------------------------------------------------------------------------- If the graph is big, or we only need to recompute part of it, we can execute it again. After execution of the graph all nodes change their state from "incomplete" to "completed". In order to evaluate the whole graph again we can use the function `setAllNodesIncomplete`. Example: ~~~ r = {0, 0, 0, 0, 0}; setAllNodesIncomplete(graph); parallelForExecutor(taskSet, graph); ~~~ The graph can be recomputed partially if we have new input data for one or several nodes in the graph. In order to do this we call the `setIncomplete()` method on every node that needs to be recomputed, and then use the `ForwardPropagator` functor to mark all dependents as "incomplete". Example: ~~~ N1.setIncomplete(); r[1] = r[4] = 0; ForwardPropagator forwardPropagator; forwardPropagator(graph); parallelForExecutor(taskSet, graph); ~~~ In this example only nodes 1 and 4 will be invoked. 
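The same partial re-evaluation can also be run on the calling thread with `SingleThreadExecutor` (declared in `graph_executor.h`); this sketch is an illustrative addition, not part of the original example set:

~~~
N1.setIncomplete();
r[1] = r[4] = 0;
ForwardPropagator forwardPropagator;
forwardPropagator(graph);
dispenso::SingleThreadExecutor singleThreadExecutor;
singleThreadExecutor(graph);
~~~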
Subgraphs -------------------------------------------------------------------------------- It is possible to organize nodes into subgraphs, and to destroy and recreate them when the computation graph has static and dynamic parts. Example: ~~~ // // ∙----subgraph1---∙ ∙---subgraph2-------∙ // ¦ ┌────────────┐ ¦ ¦ ┌───────────────┐ ¦ ┌───────────────────┐ // ¦ │ 0: r[0]+=1 │ ──▶ │ 1: r[1]=r[0]*2│ ──▶ │ 4: r[4]=r[1]+r[3] │ // ¦ └────────────┘ ¦ ¦ └───────────────┘ ¦ └───────────────────┘ // ¦ ¦ ¦ ¦ ▲ // ¦ ┌────────────┐ ¦ ¦ ┌───────────────┐ ¦ │ // ¦ │ 2: r[2]+=8 │ ──▶ │ 3: r[3]=r[2]/2│ ──────┘ // ¦ └────────────┘ ¦ ¦ └───────────────┘ ¦ // ∙----------------∙ ∙-------------------∙ std::array r; dispenso::Graph graph; dispenso::Subgraph& subgraph1 = graph.addSubgraph(); dispenso::Subgraph& subgraph2 = graph.addSubgraph(); dispenso::Node& N0 = subgraph1.addNode([&]() { r[0] += 1; }); dispenso::Node& N2 = subgraph1.addNode([&]() { r[2] += 8; }); dispenso::Node& N1 = subgraph2.addNode([&]() { r[1] += r[0] * 2; }); dispenso::Node& N3 = subgraph2.addNode([&]() { r[3] += r[2] / 2; }); dispenso::Node& N4 = graph.addNode([&]() { r[4] += r[1] + r[3]; }); N4.dependsOn(N1, N3); N1.dependsOn(N0); N3.dependsOn(N2); // evaluate graph first time r = {0, 0, 0, 0, 0}; dispenso::ConcurrentTaskSet concurrentTaskSet(dispenso::globalThreadPool()); dispenso::ConcurrentTaskSetExecutor concurrentTaskSetExecutor; concurrentTaskSetExecutor(concurrentTaskSet, graph); // disconnect and destroy nodes of subgraph2 // it invalidates node references/pointers of this subgraph subgraph2.clear(); // create new nodes dispenso::Node& newN1 = subgraph2.addNode([&]() { r[1] += r[0] * 20; }); dispenso::Node& newN3 = subgraph2.addNode([&]() { r[3] += r[2] / 20; }); newN1.dependsOn(N0); newN3.dependsOn(N2); N4.dependsOn(newN1, newN3); // and re-evaluate the graph setAllNodesIncomplete(graph); concurrentTaskSetExecutor(concurrentTaskSet, graph); ~~~ Bidirectional propagation dependency -------------------------------------------------------------------------------- In certain scenarios, nodes may alter the same memory. In such instances, it becomes necessary to compute the predecessors of the node, even if they possess a "completed" state following state propagation. To facilitate this process automatically, we introduce the notion of a bidirectional propagation dependency (`BiProp`). 
Example: ~~~ // ┌─────────────────┐ // │ 3: m3+=b*b │ // └─────────────────┘ // ▲ // ┌−-----------−−−−−−−−−│−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−┐ // ╎ ┌───────────┐ ┌─────────────────┐ ┌────────────┐ ╎ // ╎ │ 0: b+=5 │ ──▷ │ 1: b*=5 │ ──▷ │ 2: b/=m4 │ ╎ // ╎ └───────────┘ └─────────────────┘ └────────────┘ ╎ // └−−−-−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−▲−−−−−−−−−−−−┘ // Legend: │ // ──▶ Normal dependency ┌────────────┐ // ──▷ Bidirectional propagation dependency │ 4: m4+=2 │ // m4 variable modified only in node 4 └────────────┘ float b, m3, m4 dispenso::BiPropGraph g; std::array N; dispenso::BiPropNode& N0 = g.addNode([&]() { b += 5; }); dispenso::BiPropNode& N1 = g.addNode([&]() { b *= 5; }); dispenso::BiPropNode& N2 = g.addNode([&]() { b /= m4; }); dispenso::BiPropNode& N3 = g.addNode([&]() { m3 += b*b; }); dispenso::BiPropNode& N4 = g.addNode([&]() { m4 += 2; }); N3.dependsOn(N1); N2.dependsOn(N4); N2.biPropDependsOn(N1); N1.biPropDependsOn(N0); // first execution b = m3 = m4 = 0.f; dispenso::ConcurrentTaskSet concurrentTaskSet(dispenso::globalThreadPool()); dispenso::ConcurrentTaskSetExecutor concurrentTaskSetExecutor; concurrentTaskSetExecutor(concurrentTaskSet, g); N[4].setIncomplete(); // if node 4 is incomplete after propagation node 2 become incomplete. Taking in account that node 2 // bidirectionally depends on nodes 0 and 1 they will be marked as incomplete as well b = m4 = 0.f; ForwardPropagator forwardPropagator; forwardPropagator(g); concurrentTaskSetExecutor(concurrentTaskSet, g); ~~~ Please read tests from `graph_test.cpp` for more examples. */ namespace detail { class ExecutorBase; template void callFunctor(void* ptr) { (*static_cast(ptr))(); } template void destroyFunctor(void* ptr) { static_cast(ptr)->~F(); constexpr size_t kFuncSize = static_cast(dispenso::detail::nextPow2(sizeof(F))); dispenso::deallocSmallBuffer(ptr); } } // namespace detail namespace dispenso { /** * Class to store task with dependencies **/ class Node { public: Node() = delete; Node(const Node&) = delete; Node& operator=(const Node&) = delete; /** Move constructor. */ Node(Node&& other) noexcept : numIncompletePredecessors_(other.numIncompletePredecessors_.load()), numPredecessors_(other.numPredecessors_), invoke_(other.invoke_), destroy_(other.destroy_), funcBuffer_(other.funcBuffer_), dependents_(std::move(other.dependents_)) { other.funcBuffer_ = nullptr; } ~Node() { if (funcBuffer_) { destroy_(funcBuffer_); } } /** * Make this node depends on nodes. This is not concurrency safe. * * @param nodes predecessors of the node **/ template inline void dependsOn(Ns&... nodes) { ((void)std::initializer_list{(dependsOnOneNode(nodes), 0)...}); } /** * Invoke the type-erased functor. Change competed state of the node to "Incomplete". * Concurrency safe. **/ inline void run() const { invoke_(funcBuffer_); numIncompletePredecessors_.store(kCompleted, std::memory_order_release); } /** * apply an func to each dependent of the node * * @param func a functor with signature void(const Node&) **/ template inline void forEachDependent(F&& func) const { for (const Node* dependent : dependents_) { func(*dependent); } } /** * apply an func to each dependent of the node This is not concurrency safe. * * @param func a functor with signature void(Node&) **/ template inline void forEachDependent(F&& func) { for (Node* dependent : dependents_) { func(*dependent); } } /** * Return the number of nodes this node depends on. Concurrency safe. 
**/ inline size_t numPredecessors() const { return numPredecessors_; } /** * Return true if node is completed. * New node always incomplete. If node was invoked it become completed. this * state can be changed by calling setIncomplete() * Concurrency safe. **/ inline bool isCompleted() const { return numIncompletePredecessors_.load(std::memory_order_relaxed) == kCompleted; } /** * Mark node incomplete. (that allows reevaluate this node again). * Concurrency safe. This methods should never be called concurrent to when * the graph execution is happening * * @return true if state was changed. **/ inline bool setIncomplete() const { if (numIncompletePredecessors_.load(std::memory_order_relaxed) == kCompleted) { numIncompletePredecessors_.store(0, std::memory_order_relaxed); return true; } return false; } /** * Mark node completed. * Concurrency safe. This methods should never be called concurrent to when * the graph execution is happening **/ inline void setCompleted() const { numIncompletePredecessors_.store(kCompleted, std::memory_order_relaxed); } protected: /** Construct a Node with a functor. @param f The functor to execute when the node runs. */ template ::value, void>> Node(F&& f) : numIncompletePredecessors_(0) { using FNoRef = typename std::remove_reference::type; constexpr size_t kFuncSize = static_cast(detail::nextPow2(sizeof(FNoRef))); funcBuffer_ = allocSmallBuffer(); new (funcBuffer_) FNoRef(std::forward(f)); invoke_ = ::detail::callFunctor; destroy_ = ::detail::destroyFunctor; } void dependsOnOneNode(Node& node) { node.dependents_.emplace_back(this); numPredecessors_++; } static constexpr size_t kCompleted = std::numeric_limits::max(); mutable std::atomic numIncompletePredecessors_; size_t numPredecessors_ = 0; private: using InvokerType = void (*)(void* ptr); InvokerType invoke_; InvokerType destroy_; char* funcBuffer_; SmallVector dependents_; // nodes depend on this template friend class SubgraphT; friend class ::detail::ExecutorBase; template friend void setAllNodesIncomplete(const G& graph); }; /** * Class to store task with dependencies. Support bidirectional propagation dependency between *nodes. **/ class BiPropNode : public Node { public: BiPropNode() = delete; BiPropNode(const BiPropNode&) = delete; BiPropNode& operator=(const BiPropNode&) = delete; /** Move constructor. */ BiPropNode(BiPropNode&& other) noexcept : Node(std::move(other)), biPropSet_(std::move(other.biPropSet_)) {} /** * Make this node depends on nodes. Create bidirectional propagation dependency. This is not *concurrency safe. * * @param nodes predecessors of the node **/ template inline void biPropDependsOn(Ns&... nodes) { ((void)std::initializer_list{(biPropDependsOnOneNode(nodes), 0)...}); } /** * Return true if node belongs to the same propogation set. (That means both nodes after * propogation become completed/incomplete together.) 
* * @param node to test **/ inline bool isSameSet(const BiPropNode& node) const { return biPropSet_ && biPropSet_ == node.biPropSet_; } private: template ::value, void>> BiPropNode(T&& f) : Node(std::forward(f)) {} inline void removeFromBiPropSet() { if (biPropSet_ != nullptr) { auto it = std::find(biPropSet_->begin(), biPropSet_->end(), this); if (it != biPropSet_->end()) { biPropSet_->erase(it); } } } DISPENSO_DLL_ACCESS void biPropDependsOnOneNode(BiPropNode& node); std::shared_ptr> biPropSet_; template friend class SubgraphT; friend class ::detail::ExecutorBase; }; template class GraphT; /** * A subgraph within a Graph, containing a collection of nodes that can be executed together. * * @tparam N The node type (Node or BiPropNode). */ template class DISPENSO_DLL_ACCESS SubgraphT { public: using NodeType = N; SubgraphT() = delete; SubgraphT(const SubgraphT&) = delete; SubgraphT& operator=(const SubgraphT&) = delete; /** Move constructor. */ SubgraphT(SubgraphT&& other) noexcept : graph_(other.graph_), nodes_(std::move(other.nodes_)), allocator_(std::move(other.allocator_)) {} ~SubgraphT(); /** * Construct a NodeType with a valid functor. This is not concurrency safe. * * @param f A functor with signature void(). * @return reference to the created node. **/ template DISPENSO_REQUIRES(OnceCallableFunc) N& addNode(T&& f) { nodes_.push_back(new (allocator_->alloc()) NodeType(std::forward(f))); return *nodes_.back(); } /** * Return number of nodes in subgraph. Concurrency safe. **/ size_t numNodes() const { return nodes_.size(); } /** * Reserve capacity for the specified number of nodes. * This can improve performance when the number of nodes is known in advance * by avoiding vector reallocations during addNode calls. * This is not concurrency safe. * * @param n - number of nodes to reserve capacity for **/ void reserve(size_t n) { nodes_.reserve(n); } /** * Return const reference to node with index. Concurrency safe. * * @param index - index of the node **/ const N& node(size_t index) const { return *nodes_[index]; } /** * Return reference to node with index. Concurrency safe. * * @param index - index of the node **/ N& node(size_t index) { return *nodes_[index]; } /** * apply an func to each node of the subgraph. Concurrency safe. * * @param func a functor with signature void(const Node&) **/ template inline void forEachNode(F&& func) const { for (const N* node : nodes_) { func(*node); } } /** * apply an func to each node of the subgraph. This is not concurrency safe. * This methods should never be called concurrent to when the graph execution is happening * * @param func a functor with signature void(Node&) **/ template inline void forEachNode(F&& func) { for (N* node : nodes_) { func(*node); } } /** * Removes all dependency between nodes of this subgraph and other nodes, destroy this subgraph * nodes. This is not concurrency safe. 
**/ void clear(); private: using DeallocFunc = void (*)(NoLockPoolAllocator*); using PoolPtr = std::unique_ptr; static constexpr size_t kNodeSizeP2 = static_cast(detail::nextPow2(sizeof(NodeType))); explicit SubgraphT(GraphT* graph) : graph_(graph), nodes_(), allocator_(getAllocator()) {} inline void removeNodeFromBiPropSet(Node* /* node */) {} void removeNodeFromBiPropSet(BiPropNode* node) { node->removeFromBiPropSet(); } void decrementDependentCounters(); size_t markNodesWithPredicessors(); void removePredecessorDependencies(size_t numGraphPredecessors); void destroyNodes(); static PoolPtr getAllocator(); static void releaseAllocator(NoLockPoolAllocator* ptr); GraphT* graph_; #if defined(_WIN32) && !defined(__MINGW32__) #pragma warning(push) #pragma warning(disable : 4251) #endif std::vector nodes_; PoolPtr allocator_; #if defined(_WIN32) && !defined(__MINGW32__) #pragma warning(pop) #endif template friend class GraphT; }; /** * A directed acyclic graph (DAG) for expressing task dependencies. * * GraphT manages a collection of subgraphs, each containing nodes that represent work to be done. * Nodes can have dependencies on other nodes, and the graph executor ensures nodes run only after * their dependencies complete. * * @tparam N The node type (Node or BiPropNode). */ template class DISPENSO_DLL_ACCESS GraphT { public: using NodeType = N; using SubgraphType = SubgraphT; GraphT(const GraphT&) = delete; GraphT& operator=(const GraphT&) = delete; /** * Create empty graph. **/ GraphT() { subgraphs_.push_back(SubgraphType(this)); } /** * Move constructor **/ GraphT(GraphT&& other); /** * Move assignment operator **/ GraphT& operator=(GraphT&& other) noexcept; /** * Construct a NodeType with a valid functor. This node is created into subgraph 0. * This is not concurrency safe. * * @param f A functor with signature void(). **/ template DISPENSO_REQUIRES(OnceCallableFunc) N& addNode(T&& f) { return subgraphs_[0].addNode(std::forward(f)); } /** * Return number of nodes in subgraph 0. Concurrency safe. **/ size_t numNodes() const { return subgraphs_[0].numNodes(); } /** * Return const reference to node with index in subgraph 0. Concurrency safe. * * @param index - index of the node **/ const N& node(size_t index) const { return subgraphs_[0].node(index); } /** * Return reference to node with index in subgraph 0. Concurrency safe. * * @param index - index of the node **/ N& node(size_t index) { return subgraphs_[0].node(index); } /** * Create an empty subgraph. This is not concurrency safe. **/ SubgraphT& addSubgraph(); /** * Return number of subgraphs in the graph including subgraph 0. Concurrency safe. **/ size_t numSubgraphs() const { return subgraphs_.size(); } /** * Return const reference to subgraph with index. Concurrency safe. * * @param index - index of the subgraph. **/ const SubgraphT& subgraph(size_t index) const { return subgraphs_[index]; } /** * Return reference to subgraph with index. Concurrency safe. * * @param index - index of the subgraph. **/ SubgraphT& subgraph(size_t index) { return subgraphs_[index]; } /** * apply an func to each subgraph in the graph. Concurrency safe. * * @param func a functor with signature void(const SubgraphT&) **/ template inline void forEachSubgraph(F&& func) const { for (const SubgraphT& subgraph : subgraphs_) { func(subgraph); } } /** * apply an func to each subgraph in the graph. Concurrency safe. 
* * @param func a functor with signature void(SubgraphT&) **/ template inline void forEachSubgraph(F&& func) { for (SubgraphT& subgraph : subgraphs_) { func(subgraph); } } /** * apply an func to each node in the graph including all nodes from all subgraphs. Concurrency * safe. * * @param func a functor with signature void(const Node&) **/ template inline void forEachNode(F&& func) const { for (const SubgraphT& subgraph : subgraphs_) { for (const N* node : subgraph.nodes_) { func(*node); } } } /** * apply an func to each node in the graph. Concurrency safe. * * @param func a functor with signature void(const Node&) **/ template inline void forEachNode(F&& func) { for (SubgraphT& subgraph : subgraphs_) { for (N* node : subgraph.nodes_) { func(*node); } } } /** * Destroy all nodes and subgraphs. This is not concurrency safe. **/ inline void clear() { subgraphs_.clear(); subgraphs_.push_back(SubgraphType(this)); } /** * Destroy all nodes. Keeps subgraphs. This is not concurrency safe. **/ inline void clearSubgraphs() { for (SubgraphT& subgraph : subgraphs_) { subgraph.destroyNodes(); } } private: static constexpr size_t kSubgraphSizeP2 = static_cast(detail::nextPow2(sizeof(SubgraphType))); #if defined(_WIN32) && !defined(__MINGW32__) #pragma warning(push) #pragma warning(disable : 4251) #endif std::deque> subgraphs_; #if defined(_WIN32) && !defined(__MINGW32__) #pragma warning(pop) #endif template friend class SubgraphT; }; /** A DAG for task scheduling with simple dependency tracking. */ using Graph = GraphT; /** A DAG for task scheduling with bidirectional dependency propagation. */ using BiPropGraph = GraphT; /** A subgraph within a Graph. */ using Subgraph = SubgraphT; /** A subgraph within a BiPropGraph. */ using BiPropSubgraph = SubgraphT; } // namespace dispenso ================================================ FILE: dispenso/graph_executor.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include namespace dispenso { template void SingleThreadExecutor::operator()(const G& graph) { using NodeType = typename G::NodeType; nodesToExecute_.clear(); nodesToExecuteNext_.clear(); // Count total nodes to estimate capacity needs size_t nodeCount = 0; graph.forEachNode([&](const NodeType& node) { ++nodeCount; if (hasNoIncompletePredecessors(node)) { nodesToExecute_.emplace_back(&node); } }); // Reserve capacity to avoid reallocations during execution // Use a fraction of total nodes as estimate for max wave size size_t estimatedWaveSize = std::max(nodesToExecute_.size(), nodeCount / 4); nodesToExecute_.reserve(estimatedWaveSize); nodesToExecuteNext_.reserve(estimatedWaveSize); while (!nodesToExecute_.empty()) { for (const Node* n : nodesToExecute_) { const NodeType* node = static_cast(n); node->run(); node->forEachDependent([&](const Node& d) { if (decNumIncompletePredecessors( static_cast(d), std::memory_order_relaxed)) { nodesToExecuteNext_.emplace_back(static_cast(&d)); } }); } nodesToExecute_.swap(nodesToExecuteNext_); nodesToExecuteNext_.clear(); } } template void ParallelForExecutor::operator()(TaskSetT& taskSet, const G& graph) { using NodeType = typename G::NodeType; nodesToExecute_.clear(); nodesToExecuteNext_.clear(); // Count total nodes to estimate capacity needs size_t nodeCount = 0; graph.forEachNode([&](const NodeType& node) { ++nodeCount; if (hasNoIncompletePredecessors(node)) { nodesToExecute_.emplace_back(&node); } }); // Reserve capacity to avoid reallocations during execution size_t estimatedWaveSize = std::max(nodesToExecute_.size(), nodeCount / 4); nodesToExecute_.reserve(estimatedWaveSize); nodesToExecuteNext_.reserve(estimatedWaveSize); while (!nodesToExecute_.empty()) { // Use stateful parallel_for: each thread collects ready nodes locally dispenso::parallel_for( taskSet, threadStates_, []() { return ThreadLocalCollector{}; }, size_t(0), nodesToExecute_.size(), [this](ThreadLocalCollector& collector, size_t i) { const NodeType* node = static_cast(nodesToExecute_[i]); node->run(); node->forEachDependent([&](const Node& d) { // Relaxed ordering is safe here because the parallel_for wave barrier // provides a happens-before relationship, and nodesToExecute_.swap() // between waves ensures visibility of all side effects from prior waves. 
if (decNumIncompletePredecessors( static_cast(d), std::memory_order_relaxed)) { collector.readyNodes.push_back(static_cast(&d)); } }); }); // Merge thread-local collections into next wave for (auto& collector : threadStates_) { if (!collector.readyNodes.empty()) { nodesToExecuteNext_.insert( nodesToExecuteNext_.end(), collector.readyNodes.begin(), collector.readyNodes.end()); collector.readyNodes.clear(); } } nodesToExecute_.swap(nodesToExecuteNext_); nodesToExecuteNext_.clear(); } } template void ConcurrentTaskSetExecutor::operator()( dispenso::ConcurrentTaskSet& tasks, const G& graph, bool wait) { using NodeType = typename G::NodeType; startNodes_.clear(); graph.forEachNode([&](const NodeType& node) { if (hasNoIncompletePredecessors(node)) { startNodes_.emplace_back(&node); } }); for (const Node* n : startNodes_) { const NodeType* node = static_cast(n); tasks.schedule([&tasks, node]() { evaluateNodeConcurrently(tasks, node); }); } if (wait) { tasks.wait(); } } template void setAllNodesIncomplete(const G& graph) { using NodeType = typename G::NodeType; graph.forEachNode([&](const NodeType& node) { node.numIncompletePredecessors_.store(node.numPredecessors(), std::memory_order_relaxed); }); } template void ForwardPropagator::operator()(const G& graph) { using NodeType = typename G::NodeType; nodesToVisit_.clear(); nodesToVisitNext_.clear(); visited_.clear(); groups_.clear(); graph.forEachNode([&](const NodeType& node) { if (!node.isCompleted()) { nodesToVisit_.emplace_back(&node); visited_.insert(&node); appendGroup(static_cast(&node), groups_); } }); while (!nodesToVisit_.empty()) { for (const Node* node : nodesToVisit_) { node->forEachDependent([&](const Node& d) { addIncompletePredecessor(d); if (visited_.insert(static_cast(&d)).second) { nodesToVisitNext_.emplace_back(static_cast(&d)); appendGroup(static_cast(&d), groups_); } }); } nodesToVisit_.swap(nodesToVisitNext_); nodesToVisitNext_.clear(); } propagateIncompleteStateBidirectionally(); } template <> void ForwardPropagator::propagateIncompleteStateBidirectionally() {} template <> void ForwardPropagator::propagateIncompleteStateBidirectionally() { nodesToVisit_.clear(); for (const std::vector* group : groups_) { for (const dispenso::BiPropNode* gnode : *group) { if (gnode->setIncomplete()) { nodesToVisit_.emplace_back(gnode); } } } for (const Node* node : nodesToVisit_) { const BiPropNode* biPropNode = static_cast(node); biPropNode->forEachDependent([](const Node& d) { ifIncompleteAddIncompletePredecessor(d); }); } } template DISPENSO_DLL_ACCESS void SingleThreadExecutor::operator()(const Graph&); template DISPENSO_DLL_ACCESS void SingleThreadExecutor::operator()(const BiPropGraph&); template DISPENSO_DLL_ACCESS void ParallelForExecutor::operator()( TaskSet&, const Graph&); template DISPENSO_DLL_ACCESS void ParallelForExecutor::operator()( TaskSet&, const BiPropGraph&); template DISPENSO_DLL_ACCESS void ParallelForExecutor::operator()( ConcurrentTaskSet& tasks, const Graph& graph); template DISPENSO_DLL_ACCESS void ParallelForExecutor::operator()( ConcurrentTaskSet& tasks, const BiPropGraph& graph); template DISPENSO_DLL_ACCESS void ConcurrentTaskSetExecutor::operator()( dispenso::ConcurrentTaskSet& tasks, const Graph& graph, bool wait); template DISPENSO_DLL_ACCESS void ConcurrentTaskSetExecutor::operator()( dispenso::ConcurrentTaskSet& tasks, const BiPropGraph& graph, bool wait); template DISPENSO_DLL_ACCESS void setAllNodesIncomplete(const Graph&); template DISPENSO_DLL_ACCESS void setAllNodesIncomplete(const BiPropGraph&); 
template DISPENSO_DLL_ACCESS void ForwardPropagator::operator()(const Graph&); template DISPENSO_DLL_ACCESS void ForwardPropagator::operator()(const BiPropGraph&); } // namespace dispenso ================================================ FILE: dispenso/graph_executor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file graph_executor.h * @ingroup group_graph * A file providing executors for running task graphs. **/ #pragma once #include #include #include #include #include namespace dispenso { /** * Class to invoke Graph or BiPropGraph on current thread. **/ class SingleThreadExecutor : public ::detail::ExecutorBase { public: /** * Invoke the graph. This is not concurrency safe. * * @param graph graph to invoke **/ template void operator()(const G& graph); private: std::vector nodesToExecute_; std::vector nodesToExecuteNext_; }; /** * Class to invoke Graph or BiPropGraph using * dispenso::parallel_for for every layer of the graph. **/ class ParallelForExecutor : public ::detail::ExecutorBase { public: /** * Invoke the graph. This is not concurrency safe. * * @param taskSet taksSet to use with parallel_for. * @param graph graph to invoke **/ template void operator()(TaskSetT& taskSet, const G& graph); private: // Per-thread state for collecting ready nodes locally during parallel_for struct ThreadLocalCollector { std::vector readyNodes; }; std::vector nodesToExecute_; std::vector nodesToExecuteNext_; std::vector threadStates_; }; /** * Class to invoke Graph or BiPropGraph using * dispenso::ConcurrentTaskSet **/ class ConcurrentTaskSetExecutor : public ::detail::ExecutorBase { public: /** * Invoke the graph. This is not concurrency safe. * * @param tasks ConcurrentTaskSet to schedule tasks. * @param graph graph to invoke * @param wait if true run tasks.wait() at the end of the function **/ template void operator()(dispenso::ConcurrentTaskSet& tasks, const G& graph, bool wait = true); private: std::vector startNodes_; }; /** * Class to propagate incomplete state recursively from nodes to dependents **/ class ForwardPropagator : public ::detail::ExecutorBase { public: /** * Propagate incomplete state recursively from nodes to dependents * This is not concurrency safe. **/ template void operator()(const G& graph); private: template void propagateIncompleteStateBidirectionally(); std::vector nodesToVisit_; std::vector nodesToVisitNext_; std::unordered_set visited_; std::unordered_set*> groups_; }; } // namespace dispenso ================================================ FILE: dispenso/latch.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file latch.h * @ingroup group_sync * A file providing a Latch barrier type, which gives a way for threads to wait until all expected * threads have reached this point. This is intended to match API and behavior of C++20 std::latch. **/ #pragma once #include #include namespace dispenso { /** * A class which can be used for barrier scenarios. See e.g. * https://en.cppreference.com/w/cpp/thread/latch **/ class Latch { public: /** * Construct a latch with expected number of threads to wait on. * * @param threadGroupCount The number of threads in the group. 
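 *
 * Example (a minimal sketch; it assumes the usual dispenso::globalThreadPool()/ThreadPool::schedule
 * entry points, and the worker body is a placeholder):
 * @code
 * dispenso::Latch latch(4);
 * for (int i = 0; i < 4; ++i) {
 *   dispenso::globalThreadPool().schedule([&latch]() {
 *     // ... perform per-worker initialization ...
 *     latch.count_down();
 *   });
 * }
 * latch.wait();  // Returns once all four workers have counted down.
 * @endcode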
**/ explicit Latch(uint32_t threadGroupCount) noexcept : impl_(threadGroupCount) {} /** * Decrement the counter in a non-blocking manner. **/ void count_down(uint32_t n = 1) noexcept { if (impl_.intrusiveStatus().fetch_sub(n, std::memory_order_acq_rel) == 1) { impl_.notify(0); } } /** * See if the count has been reduced to zero, indicating all necessary threads * have synchronized. * * @note try_wait is a misnomer, as the function never blocks. We kept the name to match C++20 * API. * @return true only if the internal counter has reached zero. **/ bool try_wait() const noexcept { return impl_.intrusiveStatus().load(std::memory_order_acquire) == 0; } /** * Wait for all threads to have synchronized. **/ void wait() const noexcept { impl_.wait(0); } /** * Decrement the counter and wait **/ void arrive_and_wait() noexcept { if (impl_.intrusiveStatus().fetch_sub(1, std::memory_order_acq_rel) > 1) { impl_.wait(0); } else { impl_.notify(0); } } private: detail::CompletionEventImpl impl_; }; } // namespace dispenso ================================================ FILE: dispenso/once_function.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file once_function.h * @ingroup group_core * A file providing OnceFunction, a class providing void() signature for closure to be called only * once. It is built to be cheap to create and move. **/ #pragma once #include #include #include namespace dispenso { #if DISPENSO_HAS_CONCEPTS /** * @concept OnceCallableFunc * @brief A callable suitable for wrapping in OnceFunction or scheduling as a task. * * The callable must be invocable with no arguments. The return value (if any) is discarded. * This is the fundamental requirement for functors passed to OnceFunction, * ThreadPool::schedule, TaskSet::schedule, and similar scheduling interfaces. **/ template concept OnceCallableFunc = std::invocable; #endif // DISPENSO_HAS_CONCEPTS namespace detail { template class FutureBase; template class FutureImplBase; } // namespace detail /** * A class fullfilling the void() signature, and operator() must be called exactly once for valid * OnceFunctions. This class can be much more efficient than std::function for type * erasing functors without too much state (currently < ~250 bytes). * @note The wrapped type-erased functor in OnceFunction is *not* deleted upon destruction, but * rather when operator() is called. It is the user's responsibility to ensure that operator() is * called. * **/ class OnceFunction { public: /** * Construct a OnceFunction with invalid state. **/ OnceFunction() #if defined DISPENSO_DEBUG : data_(nullptr), invoke_(nullptr) #endif // DISPENSO_DEBUG { } /** * Construct a OnceFunction with a valid functor. * * @param f A functor with signature void(). Ideally this should be a concrete functor (e.g. from * lambda), though it will work with e.g. std::function. The downside in the latter case is extra * overhead for double type erasure. **/ template DISPENSO_REQUIRES(OnceCallableFunc) OnceFunction(F&& f) { auto callable = detail::createOnceCallable(std::forward(f)); data_ = callable.data; invoke_ = callable.invoke; } OnceFunction(const OnceFunction& other) = delete; /** Move constructor. 
*/ OnceFunction(OnceFunction&& other) noexcept : data_(other.data_), invoke_(other.invoke_) { #if defined DISPENSO_DEBUG other.data_ = nullptr; other.invoke_ = nullptr; #endif // DISPENSO_DEBUG } OnceFunction& operator=(OnceFunction&& other) noexcept { data_ = other.data_; invoke_ = other.invoke_; #if defined DISPENSO_DEBUG if (&other != this) { other.data_ = nullptr; other.invoke_ = nullptr; } #endif // DISPENSO_DEBUG return *this; } /** * Destroy the type-erased functor and release its resources without invoking it. * Use this when a OnceFunction will not be called but its resources must still be freed. * Like operator(), this must be called at most once, and the OnceFunction must not be used after. **/ void cleanupNotRun() const { #if defined DISPENSO_DEBUG assert(data_ != nullptr && "Must not cleanup an invalid OnceFunction!"); invoke_(data_, false); data_ = nullptr; invoke_ = nullptr; #else invoke_(data_, false); #endif // DISPENSO_DEBUG } /** * Invoke the type-erased functor. This function must be called exactly once. Fewer will result * in a leak, while more will invoke on an invalid object. **/ void operator()() const { #if defined DISPENSO_DEBUG assert(data_ != nullptr && "Must not use OnceFunction more than once!"); #endif // DISPENSO_DEBUG invoke_(data_, true); #if defined DISPENSO_DEBUG data_ = nullptr; invoke_ = nullptr; #endif // DISPENSO_DEBUG } private: OnceFunction(detail::OnceCallable* func, bool) : data_(func), invoke_(&detail::runOnceCallable) {} mutable void* data_; void (*invoke_)(void*, bool); template friend class detail::FutureBase; template friend class detail::FutureImplBase; }; } // namespace dispenso ================================================ FILE: dispenso/parallel_for.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file parallel_for.h * @ingroup group_parallel * Functions for performing parallel for loops. **/ #pragma once #include #include #include #include #include #include namespace dispenso { #if DISPENSO_HAS_CONCEPTS /** * @concept ParallelForRangeFunc * @brief A callable suitable for chunked parallel_for with (begin, end) signature. * * The callable must be invocable with two integer arguments representing the chunk range. **/ template concept ParallelForRangeFunc = std::invocable; /** * @concept ParallelForIndexFunc * @brief A callable suitable for element-wise parallel_for with single index signature. * * The callable must be invocable with a single integer argument representing the element index. **/ template concept ParallelForIndexFunc = std::invocable; /** * @concept ParallelForStateRangeFunc * @brief A callable suitable for stateful chunked parallel_for. * * The callable must be invocable with (State&, begin, end) arguments. **/ template concept ParallelForStateRangeFunc = std::invocable; /** * @concept ParallelForStateIndexFunc * @brief A callable suitable for stateful element-wise parallel_for. * * The callable must be invocable with (State&, index) arguments. **/ template concept ParallelForStateIndexFunc = std::invocable; #endif // DISPENSO_HAS_CONCEPTS /** * Chunking strategy. Typically if the cost of each loop iteration is roughly constant, kStatic * load balancing is preferred. 
Additionally, when making a non-waiting parallel_for call in * conjunction with other parallel_for calls or with other task submissions to a TaskSet, some * dynamic load balancing is automatically introduced, and selecting kStatic load balancing here can * be better. If the workload per iteration deviates a lot from constant, and some ranges may be * much cheaper than others, select kAuto. **/ enum class ParForChunking { kStatic, kAuto }; /** * A set of options to control parallel_for **/ struct ParForOptions { /** * The maximum number of threads to use. This can be used to limit the number of threads below * the number associated with the TaskSet's thread pool to control the degree of concurrency. * Setting maxThreads to zero or one will result in serial operation. **/ uint32_t maxThreads = std::numeric_limits::max(); /** * Specify whether the return of the parallel_for signifies the work is complete. If the * parallel_for is initiated without providing a TaskSet, the parallel_for will always wait. * * @note If wait is true, the calling thread will always participate in computation. If this is * not desired, pass wait as false, and wait manually outside of the parallel_for on the passed * TaskSet. **/ bool wait = true; /** * Specify whether default chunking should be static or auto (dynamic load balancing). This is * used when invoking the version of parallel_for that takes index parameters (vs a ChunkedRange). **/ ParForChunking defaultChunking = ParForChunking::kStatic; /** * Specify a minimum number of items per chunk for static or auto dynamic load balancing. Cheaper * workloads should have a higher number of minWorkItems. Will be ignored if an explicit chunk * size is provided to ChunkedRange. **/ uint32_t minItemsPerChunk = 1; /** * When set to false, and StateContainers are supplied to parallel_for, re-create container from * scratch each call to parallel_for. When true, reuse existing state as much as possible (only * create new state if we require more than is already available in the container). **/ bool reuseExistingState = false; }; /** * A helper class for parallel_for. It provides various configuration parameters to * describe how to break up work for parallel processing. ChunkedRanges can be created with Auto * chunking, Static chunking, or specific chunking. Auto chunking makes large chunks for better * cache utilization, but tries to make enough chunks to provide some dynamic load balancing. Static * chunking makes N chunks given N threads to run the loop on. User-specified chunking can be * useful for ensuring e.g. that at least a multiple of SIMD width is provided per chunk. * parallel_for calls that don't accept a ChunkedRange will create a ChunkedRange * internally using Auto chunking. **/ template struct ChunkedRange { // We need to utilize 64-bit integers to avoid overflow, e.g. passing -2**30, 2**30 as int32 will // result in overflow unless we cast to 64-bit. Note that if we have a range of e.g. -2**63+1 to // 2**63-1, we cannot hold the result in an int64_t. We could in a uint64_t, but it is quite // tricky to make this work. However, I do not expect ranges larger than can be held in int64_t // since people want their computations to finish before the heat death of the sun (slight // exaggeration). using size_type = std::conditional_t::value, int64_t, uint64_t>; struct Static {}; struct Auto {}; static constexpr IntegerT kStatic = std::numeric_limits::max(); /** * Create a ChunkedRange with specific chunk size * * @param s The start of the range. 
* @param e The end of the range. * @param c The chunk size. **/ ChunkedRange(IntegerT s, IntegerT e, IntegerT c) : start(s), end(e), chunk(c) {} /** * Create a ChunkedRange with chunk size equal to total items divided by number of threads. * * @param s The start of the range. * @param e The end of the range. **/ ChunkedRange(IntegerT s, IntegerT e, Static) : ChunkedRange(s, e, kStatic) {} /** * Create a ChunkedRange with chunk size determined automatically to enable some dynamic load * balancing. * * @param s The start of the range. * @param e The end of the range. **/ ChunkedRange(IntegerT s, IntegerT e, Auto) : ChunkedRange(s, e, 0) {} bool isStatic() const { return chunk == kStatic; } bool isAuto() const { return chunk == 0; } bool empty() const { return end <= start; } size_type size() const { return static_cast(end) - start; } template std::tuple calcChunkSize(OtherInt numLaunched, bool oneOnCaller, size_type minChunkSize) const { size_type workingThreads = static_cast(numLaunched) + size_type{oneOnCaller}; assert(workingThreads > 0); if (!chunk) { size_type dynFactor = std::min(16, size() / workingThreads); size_type chunkSize; do { size_type roughChunks = dynFactor * workingThreads; chunkSize = (size() + roughChunks - 1) / roughChunks; --dynFactor; } while (chunkSize < minChunkSize); return {chunkSize, (size() + chunkSize - 1) / chunkSize}; } else if (chunk == kStatic) { // This should never be called. The static distribution versions of the parallel_for // functions should be invoked instead. std::abort(); } return {chunk, (size() + chunk - 1) / chunk}; } IntegerT start; IntegerT end; IntegerT chunk; }; /** * Create a ChunkedRange with specified chunking strategy. * * @param start The start of the range. * @param end The end of the range. * @param chunking The strategy to use for chunking. **/ template inline ChunkedRange> makeChunkedRange(IntegerA start, IntegerB end, ParForChunking chunking = ParForChunking::kStatic) { using IntegerT = std::common_type_t; return (chunking == ParForChunking::kStatic) ? ChunkedRange(start, end, typename ChunkedRange::Static()) : ChunkedRange(start, end, typename ChunkedRange::Auto()); } /** * Create a ChunkedRange with specific chunk size * * @param start The start of the range. * @param end The end of the range. * @param chunkSize The chunk size. 
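 *
 * Example (a minimal sketch; the loop body is a placeholder):
 * @code
 * // Process [0, 1 << 20) in chunks of 256 elements, e.g. a multiple of the SIMD width.
 * auto range = dispenso::makeChunkedRange(0, 1 << 20, 256);
 * dispenso::parallel_for(range, [](int begin, int end) {
 *   for (int i = begin; i < end; ++i) {
 *     // ... operate on element i ...
 *   }
 * });
 * @endcode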
**/ template inline ChunkedRange> makeChunkedRange(IntegerA start, IntegerB end, IntegerC chunkSize) { return ChunkedRange>(start, end, chunkSize); } namespace detail { struct NoOpIter { using difference_type = std::ptrdiff_t; using value_type = int; using pointer = int*; using reference = int&; using iterator_category = std::random_access_iterator_tag; int& operator*() const { static DISPENSO_THREAD_LOCAL int dummy = 0; return dummy; } NoOpIter& operator++() { return *this; } NoOpIter operator++(int) { return *this; } NoOpIter& operator--() { return *this; } NoOpIter operator--(int) { return *this; } NoOpIter& operator+=(difference_type) { return *this; } NoOpIter& operator-=(difference_type) { return *this; } NoOpIter operator+(difference_type) const { return *this; } NoOpIter operator-(difference_type) const { return *this; } difference_type operator-(const NoOpIter&) const { return 0; } bool operator==(const NoOpIter&) const { return true; } bool operator!=(const NoOpIter&) const { return false; } bool operator<(const NoOpIter&) const { return false; } int& operator[](difference_type) const { static DISPENSO_THREAD_LOCAL int dummy = 0; return dummy; } }; struct NoOpContainer { size_t size() const { return 0; } bool empty() const { return true; } void clear() {} NoOpIter begin() { return {}; } void emplace_back(int) {} int& front() { static int i; return i; } }; struct NoOpStateGen { int operator()() const { return 0; } }; /** * Initialize states container with enough entries for the given thread count. * Respects reuseExistingState: when true, only adds entries if the container * doesn't already have enough. */ template void initStates( StateContainer& states, const StateGen& defaultState, size_t numNeeded, bool reuseExistingState) { if (!reuseExistingState) { states.clear(); } for (size_t i = states.size(); i < numNeeded; ++i) { states.emplace_back(defaultState()); } } template < typename TaskSetT, typename IntegerT, typename F, typename StateContainer, typename StateGen> void parallel_for_staticImpl( TaskSetT& taskSet, StateContainer& states, const StateGen& defaultState, const ChunkedRange& range, F&& f, ssize_t maxThreads, bool wait, bool reuseExistingState) { using size_type = typename ChunkedRange::size_type; size_type numThreads = std::min(taskSet.numPoolThreads() + wait, maxThreads); // Reduce threads used if they exceed work to be done. numThreads = std::min(numThreads, range.size()); detail::initStates(states, defaultState, static_cast(numThreads), reuseExistingState); auto chunking = detail::staticChunkSize(static_cast(range.size()), static_cast(numThreads)); IntegerT chunkSize = static_cast(chunking.ceilChunkSize); bool perfectlyChunked = static_cast(chunking.transitionTaskIndex) == numThreads; // Number of tasks to schedule (all but the last one if wait is true) size_type numToSchedule = wait ? numThreads - 1 : numThreads; if (numToSchedule > 0) { // Precompute range boundaries for the generator // First loop: indices [0, transitionTaskIndex) use ceilChunkSize // Second loop: indices [transitionTaskIndex, numToSchedule) use ceilChunkSize - 1 size_type transitionIdx = perfectlyChunked ? 
numToSchedule : chunking.transitionTaskIndex; IntegerT smallChunkSize = static_cast(chunkSize - !perfectlyChunked); taskSet.scheduleBulk( static_cast(numToSchedule), [&, chunkSize, smallChunkSize, transitionIdx](size_t idx) { // Calculate start position for this chunk IntegerT start; IntegerT thisChunkSize; if (static_cast(idx) < transitionIdx) { IntegerT i = static_cast(idx); start = static_cast(range.start + static_cast(i * chunkSize)); thisChunkSize = chunkSize; } else { // After transition, chunks are smaller by 1 IntegerT ti = static_cast(transitionIdx); IntegerT ri = static_cast(idx - transitionIdx); start = static_cast( range.start + static_cast(ti * chunkSize) + static_cast(ri * smallChunkSize)); thisChunkSize = smallChunkSize; } IntegerT end = static_cast(start + thisChunkSize); auto stateIt = states.begin(); std::advance(stateIt, static_cast(idx)); return [it = stateIt, start, end, f]() { auto recurseInfo = detail::PerPoolPerThreadInfo::parForRecurse(); f(*it, start, end); }; }); } if (wait) { // Execute the last chunk on the calling thread auto stateIt = states.begin(); std::advance(stateIt, static_cast(numThreads - 1)); // Calculate start of last chunk size_type transitionIdx = perfectlyChunked ? numThreads - 1 : chunking.transitionTaskIndex; IntegerT smallChunkSize = static_cast(chunkSize - !perfectlyChunked); IntegerT lastStart; if (numThreads - 1 < transitionIdx) { IntegerT i = static_cast(numThreads - 1); lastStart = static_cast(range.start + static_cast(i * chunkSize)); } else { IntegerT ti = static_cast(transitionIdx); IntegerT ri = static_cast(numThreads - 1 - transitionIdx); lastStart = static_cast( range.start + static_cast(ti * chunkSize) + static_cast(ri * smallChunkSize)); } f(*stateIt, lastStart, range.end); taskSet.wait(); } } template struct ChunkSizingResult { typename ChunkedRange::size_type maxThreads; bool isStatic; }; /** * Adjust the thread count and static/dynamic scheduling decision based on work size. * * The goal is to avoid oversubscription and ensure each thread gets enough work: * * 1. Cap maxThreads to available pool threads (plus calling thread if waiting). * 2. If minItemsPerChunk > 1, reduce thread count so each thread gets at least * that many items. If even after reduction the chunks are too small, fall back * to static scheduling (which distributes items evenly without atomic contention). * 3. If minItemsPerChunk == 1 and the range is smaller than the thread count, * fall back to static scheduling (auto mode) or cap threads (explicit dynamic). 
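 *
 * Worked example (illustrative numbers only): with range.size() == 100, minItemsPerChunk == 16,
 * poolThreads == 8, wait == true, and maxThreads unlimited, step 1 caps maxThreads at 9;
 * step 2a computes maxWorkers = 100 / 16 = 6, reducing maxThreads to 6; step 2b then sees
 * 100 / (6 + 1) = 14 < 16, so an auto-chunked range falls back to static scheduling.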
*/ template ChunkSizingResult adjustChunkSizing( const ChunkedRange& range, typename ChunkedRange::size_type maxThreads, bool isStatic, uint32_t minItemsPerChunk, typename ChunkedRange::size_type poolThreads, bool wait) { using size_type = typename ChunkedRange::size_type; // Step 1: never use more threads than the pool actually has maxThreads = std::min(maxThreads, poolThreads + wait); if (minItemsPerChunk > 1) { // Step 2a: reduce threads so each gets at least minItemsPerChunk items size_type maxWorkers = range.size() / minItemsPerChunk; if (maxWorkers < maxThreads) { maxThreads = maxWorkers; } // Step 2b: if dynamic chunks would still be too small, use static scheduling if (maxThreads > 0 && range.size() / (maxThreads + wait) < minItemsPerChunk && range.isAuto()) { isStatic = true; } } else if (range.size() <= poolThreads + wait) { // Step 3: fewer items than threads — static is better (no atomic overhead) if (range.isAuto()) { isStatic = true; } else if (!range.isStatic()) { maxThreads = range.size() - wait; } } return {maxThreads, isStatic}; } /** * Dynamic (work-stealing) parallel_for implementation. * * Workers atomically claim chunks from a shared index and process them. * The ExitAction callback is invoked when a worker finds no more chunks; * this allows the no-wait path to deallocate a heap-allocated index when * the last worker exits, while the wait path passes a no-op. * * When wait is true, the calling thread participates as an additional worker * and blocks until all tasks complete. */ template < typename TaskSetT, typename IntegerT, typename F, typename StateContainer, typename IndexRef, typename ExitAction> void parallel_for_dynamicImpl( TaskSetT& taskSet, StateContainer& states, IntegerT start, IntegerT end, F&& f, size_t numToLaunch, typename ChunkedRange::size_type chunkSize, typename ChunkedRange::size_type numChunks, IndexRef& index, ExitAction exitAction, bool wait) { auto worker = [start, end, &index, f, chunkSize, numChunks, exitAction](auto& s) { auto recurseInfo = detail::PerPoolPerThreadInfo::parForRecurse(); while (true) { auto cur = index.fetch_add(1, std::memory_order_relaxed); if (cur >= numChunks) { exitAction(cur); break; } auto sidx = static_cast(start + cur * chunkSize); if (cur + 1 == numChunks) { f(s, sidx, end); } else { f(s, sidx, static_cast(sidx + chunkSize)); } } }; taskSet.scheduleBulk(static_cast(numToLaunch), [&states, &worker](size_t i) { auto it = states.begin(); std::advance(it, static_cast(i)); return [&s = *it, worker]() { worker(s); }; }); if (wait) { auto it = states.begin(); std::advance(it, static_cast(numToLaunch)); worker(*it); taskSet.wait(); } } } // namespace detail /** * Execute loop over the range in parallel. * * @param taskSet The task set to schedule the loop on. * @param states A container of State (actual type of State TBD by user). The * container will be resized to hold a State object per executing thread. Container * must provide emplace_back() and must be forward-iterable. Examples include std::vector, * std::deque, and std::list. These are the states passed into f, and states must * remain a valid object until work is completed. * @param defaultState A functor with signature State(). It will be called to initialize the * objects for states. * @param range The range defining the loop extents as well as chunking strategy. * @param f The functor to execute in parallel. Must have a signature like * void(State &s, size_t begin, size_t end). * @param options See ParForOptions for details. 
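 *
 * Example (a minimal sketch of a parallel reduction; `data` is a hypothetical std::vector<int>):
 * @code
 * dispenso::TaskSet taskSet(dispenso::globalThreadPool());
 * std::vector<int64_t> sums;
 * dispenso::parallel_for(
 *     taskSet,
 *     sums,
 *     []() { return int64_t{0}; },
 *     dispenso::makeChunkedRange(size_t{0}, data.size(), dispenso::ParForChunking::kAuto),
 *     [&data](int64_t& sum, size_t begin, size_t end) {
 *       for (size_t i = begin; i < end; ++i) {
 *         sum += data[i];
 *       }
 *     });
 * int64_t total = 0;
 * for (int64_t s : sums) {
 *   total += s;
 * }
 * @endcode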
**/ template < typename TaskSetT, typename IntegerT, typename F, typename StateContainer, typename StateGen> void parallel_for( TaskSetT& taskSet, StateContainer& states, const StateGen& defaultState, const ChunkedRange& range, F&& f, ParForOptions options = {}) { if (range.empty()) { if (options.wait) { taskSet.wait(); } return; } using size_type = typename ChunkedRange::size_type; uint32_t minItemsPerChunk = std::max(1, options.minItemsPerChunk); size_type maxThreads = std::max(options.maxThreads, 1); bool isStatic = range.isStatic(); const size_type N = taskSet.numPoolThreads(); if (N == 0 || !options.maxThreads || range.size() <= minItemsPerChunk || detail::PerPoolPerThreadInfo::isParForRecursive(&taskSet.pool())) { detail::initStates(states, defaultState, 1, options.reuseExistingState); f(*states.begin(), range.start, range.end); if (options.wait) { taskSet.wait(); } return; } auto chunkSizing = detail::adjustChunkSizing(range, maxThreads, isStatic, minItemsPerChunk, N, options.wait); maxThreads = chunkSizing.maxThreads; isStatic = chunkSizing.isStatic; // If adjustment reduced threads below 2, run inline — not worth parallelizing. if (maxThreads < 2) { detail::initStates(states, defaultState, 1, options.reuseExistingState); f(*states.begin(), range.start, range.end); if (options.wait) { taskSet.wait(); } return; } if (isStatic) { detail::parallel_for_staticImpl( taskSet, states, defaultState, range, std::forward(f), static_cast(maxThreads), options.wait, options.reuseExistingState); return; } const size_type numToLaunch = std::min(maxThreads - options.wait, N); detail::initStates( states, defaultState, static_cast(numToLaunch + options.wait), options.reuseExistingState); if (numToLaunch == 1 && !options.wait) { taskSet.schedule( [&s = states.front(), range, f = std::move(f)]() { f(s, range.start, range.end); }); return; } auto chunkInfo = range.calcChunkSize(numToLaunch, options.wait, minItemsPerChunk); auto chunkSize = std::get<0>(chunkInfo); auto numChunks = std::get<1>(chunkInfo); if (options.wait) { alignas(kCacheLineSize) std::atomic index(0); detail::parallel_for_dynamicImpl( taskSet, states, range.start, range.end, std::forward(f), static_cast(numToLaunch), chunkSize, numChunks, index, [](auto) {}, options.wait); } else { using SizeType = decltype(numChunks); struct ChunkIndex { std::atomic index; }; static_assert(sizeof(ChunkIndex) <= kCacheLineSize, "ChunkIndex must fit in one cache line"); char* mem = allocSmallBuffer(); auto* ci = new (mem) ChunkIndex{{0}}; SizeType lastExit = numChunks + static_cast(numToLaunch) - 1; detail::parallel_for_dynamicImpl( taskSet, states, range.start, range.end, std::forward(f), static_cast(numToLaunch), chunkSize, numChunks, ci->index, [ci, lastExit](auto cur) { if (cur == lastExit) { deallocSmallBuffer(ci); } }, options.wait); } } /** * Execute loop over the range in parallel. * * @param taskSet The task set to schedule the loop on. * @param range The range defining the loop extents as well as chunking strategy. * @param f The functor to execute in parallel. Must have a signature like * void(size_t begin, size_t end). * @param options See ParForOptions for details. 
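 *
 * Example (a minimal sketch of a non-waiting invocation; `results` is a hypothetical preallocated
 * std::vector<float> and produceValue is a placeholder):
 * @code
 * dispenso::TaskSet taskSet(dispenso::globalThreadPool());
 * auto range =
 *     dispenso::makeChunkedRange(size_t{0}, results.size(), dispenso::ParForChunking::kStatic);
 * dispenso::ParForOptions options;
 * options.wait = false;  // Schedule the loop without blocking the calling thread.
 * dispenso::parallel_for(
 *     taskSet,
 *     range,
 *     [&results](size_t begin, size_t end) {
 *       for (size_t i = begin; i < end; ++i) {
 *         results[i] = produceValue(i);
 *       }
 *     },
 *     options);
 * // ... do other work on this thread ...
 * taskSet.wait();  // Explicitly wait on the TaskSet once the results are needed.
 * @endcode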
**/ template DISPENSO_REQUIRES(ParallelForRangeFunc) void parallel_for( TaskSetT& taskSet, const ChunkedRange& range, F&& f, ParForOptions options = {}) { detail::NoOpContainer container; parallel_for( taskSet, container, detail::NoOpStateGen(), range, [f = std::move(f)](int /*noop*/, auto i, auto j) { f(i, j); }, options); } /** * Execute loop over the range in parallel on the global thread pool, and wait until complete. * * @param range The range defining the loop extents as well as chunking strategy. * @param f The functor to execute in parallel. Must have a signature like * void(size_t begin, size_t end). * @param options See ParForOptions for details. options.wait will always be reset *to true. **/ template DISPENSO_REQUIRES(ParallelForRangeFunc) void parallel_for(const ChunkedRange& range, F&& f, ParForOptions options = {}) { TaskSet taskSet(globalThreadPool()); options.wait = true; parallel_for(taskSet, range, std::forward(f), options); } /** * Execute loop over the range in parallel on the global thread pool and block until loop *completion. * * @param states A container of State (actual type of State TBD by user). The * container will be resized to hold a State object per executing thread. Container * must provide emplace_back() and must be forward-iterable. Examples include std::vector, * std::deque, and std::list. These are the states passed into f, and states must * remain a valid object until work is completed. * @param defaultState A functor with signature State(). It will be called to initialize the * objects for states. * @param range The range defining the loop extents as well as chunking strategy. * @param f The functor to execute in parallel. Must have a signature like * void(State &s, size_t begin, size_t end). * @param options See ParForOptions for details. options.wait will always be reset *to true. **/ template void parallel_for( StateContainer& states, const StateGen& defaultState, const ChunkedRange& range, F&& f, ParForOptions options = {}) { TaskSet taskSet(globalThreadPool()); options.wait = true; parallel_for(taskSet, states, defaultState, range, std::forward(f), options); } /** * Execute loop over the range in parallel. * * @param taskSet The task set to schedule the loop on. * @param start The start of the loop extents. * @param end The end of the loop extents. * @param f The functor to execute in parallel. Must have a signature like * void(size_t index) or void(size_t begin, size_t end). * @param options See ParForOptions for details. 
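 *
 * Example (a minimal sketch; `in` and `out` are hypothetical arrays of length n, with n a size_t):
 * @code
 * dispenso::TaskSet taskSet(dispenso::globalThreadPool());
 * dispenso::parallel_for(taskSet, size_t{0}, n, [&](size_t i) { out[i] = 2 * in[i]; });
 * @endcode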
**/ template < typename TaskSetT, typename IntegerA, typename IntegerB, typename F, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true> void parallel_for( TaskSetT& taskSet, IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) { using IntegerT = std::common_type_t; auto range = makeChunkedRange(start, end, options.defaultChunking); parallel_for( taskSet, range, [f = std::move(f)](IntegerT s, IntegerT e) { for (IntegerT i = s; i < e; ++i) { f(i); } }, options); } /** @overload */ template < typename TaskSetT, typename IntegerA, typename IntegerB, typename F, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true> void parallel_for( TaskSetT& taskSet, IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) { auto range = makeChunkedRange(start, end, options.defaultChunking); parallel_for(taskSet, range, std::forward(f), options); } /** * Execute loop over the range in parallel on the global thread pool and block on loop completion. * * @param start The start of the loop extents. * @param end The end of the loop extents. * @param f The functor to execute in parallel. Must have a signature like * void(size_t index) or void(size_t begin, size_t end). * @param options See ParForOptions for details. options.wait will always be reset *to true. **/ template < typename IntegerA, typename IntegerB, typename F, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true> void parallel_for(IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) { TaskSet taskSet(globalThreadPool()); options.wait = true; parallel_for(taskSet, start, end, std::forward(f), options); } /** * Execute loop over the range in parallel. * * @param taskSet The task set to schedule the loop on. * @param states A container of State (actual type of State TBD by user). The * container will be resized to hold a State object per executing thread. Container * must provide emplace_back() and must be forward-iterable. Examples include std::vector, * std::deque, and std::list. These are the states passed into f, and states must * remain a valid object until work is completed. * @param defaultState A functor with signature State(). It will be called to initialize the * objects for states. * @param start The start of the loop extents. * @param end The end of the loop extents. * @param f The functor to execute in parallel. Must have a signature like * void(State &s, size_t index) or * void(State &s, size_t begin, size_t end). * @param options See ParForOptions for details. 
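 *
 * Example (a minimal sketch of a per-thread counter; `values` is a hypothetical std::vector<int>):
 * @code
 * dispenso::TaskSet taskSet(dispenso::globalThreadPool());
 * std::vector<size_t> counts;
 * dispenso::parallel_for(
 *     taskSet,
 *     counts,
 *     []() { return size_t{0}; },
 *     size_t{0},
 *     values.size(),
 *     [&values](size_t& count, size_t i) { count += values[i] > 0; });
 * size_t numPositive = 0;
 * for (size_t c : counts) {
 *   numPositive += c;
 * }
 * @endcode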
**/ template < typename TaskSetT, typename IntegerA, typename IntegerB, typename F, typename StateContainer, typename StateGen, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true, std::enable_if_t< detail::CanInvoke::value, bool> = true> void parallel_for( TaskSetT& taskSet, StateContainer& states, const StateGen& defaultState, IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) { using IntegerT = std::common_type_t; auto range = makeChunkedRange(start, end, options.defaultChunking); parallel_for( taskSet, states, defaultState, range, [f = std::move(f)](auto& state, IntegerT s, IntegerT e) { for (IntegerT i = s; i < e; ++i) { f(state, i); } }, options); } /** @overload */ template < typename TaskSetT, typename IntegerA, typename IntegerB, typename F, typename StateContainer, typename StateGen, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true, std::enable_if_t< detail::CanInvoke::value, bool> = true> void parallel_for( TaskSetT& taskSet, StateContainer& states, const StateGen& defaultState, IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) { auto range = makeChunkedRange(start, end, options.defaultChunking); parallel_for(taskSet, states, defaultState, range, std::forward(f), options); } /** * Execute loop over the range in parallel on the global thread pool and block until loop *completion. * * @param states A container of State (actual type of State TBD by user). The * container will be resized to hold a State object per executing thread. Container * must provide emplace_back() and must be forward-iterable. Examples include std::vector, * std::deque, and std::list. These are the states passed into f, and states must * remain a valid object until work is completed. * @param defaultState A functor with signature State(). It will be called to initialize the * objects for states. * @param start The start of the loop extents. * @param end The end of the loop extents. * @param f The functor to execute in parallel. Must have a signature like * void(State &s, size_t index) or * void(State &s, size_t begin, size_t end). * @param options See ParForOptions for details. options.wait will always be reset *to true. **/ template < typename IntegerA, typename IntegerB, typename F, typename StateContainer, typename StateGen, std::enable_if_t::value, bool> = true, std::enable_if_t::value, bool> = true> void parallel_for( StateContainer& states, const StateGen& defaultState, IntegerA start, IntegerB end, F&& f, ParForOptions options = {}) { TaskSet taskSet(globalThreadPool()); options.wait = true; parallel_for(taskSet, states, defaultState, start, end, std::forward(f), options); } } // namespace dispenso ================================================ FILE: dispenso/pipeline.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file pipeline.h * @ingroup group_graph * A file providing utilities for parallel pipelining of work. **/ #pragma once #include #include namespace dispenso { /** * OpResult is like a poor-man's std::optional for those who wish to use dispenso pipeline filtering * in C++14. In C++17 and beyond, it is recommended to use std::optional instead. 
OpResult has * implicit construct from T, just like std::optional, and move/copy constructors and operators, * bool conversion, and value() function, but otherwise provides less functionality than * std::optional. **/ template using OpResult = detail::OpResult; /** * A simple constant representing maximum parallelism for a stage. This number has no particular * significance, and is simply here for convenience. **/ constexpr ssize_t kStageNoLimit = std::numeric_limits::max(); /** * Create a stage for use in the pipeline function. * * @param f A function-like object that can accept the result of the previous stage (if any), and * which produces the output for the next stage (if any). * @param limit How many threads may concurrently run work for this stage. Values larger than the * number of threads in the associated thread pool of the used ConcurrentTaskSet will be capped to * the size of the pool. * @return A stage object suitable for pipelining. **/ template auto stage(F&& f, ssize_t limit) { return detail::Stage(std::forward(f), limit); } /** * Pipeline work in stages. Pipelines allow stages to specify parallelism limits by using the * stage function, or a function-like object can simply be passed directly, indicating * a serial stage. Even if stages are serial, there can be parallelism between stages, so in a 3 * stage serial pipeline, the expected runtime is the max of the 3 stages runtimes (note that this * is in the absence of pipeline overheads and with an infinitely long workstream. In practice * speedup is somewhat less). This function will block until the entire pipeline has completed. * * @param pool The ThreadPool to run the work in. This inherently determines the upper bound for * parallelism of the pipeline. * @param sIn The stages to run. The first stage must be a Generator stage, the last must be a Sink * stage, and intermediate stages are Transform stages. * - If there is only one stage, it takes no * arguments, but returns a bool indicating completion (false means the pipeline is complete). * - Otherwise, the Generator stage takes no arguments and must return an OpResult or std::optional * value, and an invalid/nullopt result indicates that the Generator is done (no more values * forthcoming). * - Transform stages should accept the output of the prior stage (or output.value() in the case of * OpResult or std::optional), and should return either a value or an OpResult or std::optional * value if the Transform is capable of filtering results. Invalid/nullopt OpResult or std::optional * values indicate that the value should be filtered, and not passed on to the next stage. * - The Sink stage should accept the output of the prior stage, just as a Transform stage does, but * does not return any value (or at least the pipeline will ignore it). * * @note Exception behavior: If a stage function throws an exception, the pipeline will stop * producing new work (the generator checks for exceptions between iterations) and allow any * already-in-flight work to complete. Queued work that has not yet started will be discarded * without execution. The first captured exception is rethrown from pipeline() when the * pipeline winds down. Subsequent exceptions from concurrently in-flight stages are silently * discarded. All internal state (concurrency counters, resource slots, completion events) is * cleaned up via RAII, so the thread pool remains in a valid state after an exception and may be * reused for subsequent pipelines. 
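 *
 * Example (a minimal sketch of a three-stage pipeline that generates 100 indices, squares them
 * with up to four concurrent workers, and accumulates results serially; the stage bodies are
 * placeholders):
 * @code
 * int next = 0;
 * int64_t sum = 0;
 * dispenso::pipeline(
 *     dispenso::globalThreadPool(),
 *     [&next]() -> std::optional<int> {
 *       if (next == 100) {
 *         return std::nullopt;  // Generator is done.
 *       }
 *       return next++;
 *     },
 *     dispenso::stage([](int v) { return int64_t{v} * v; }, 4),
 *     [&sum](int64_t v) { sum += v; });
 * @endcode
 *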
When exceptions are disabled at compile time * (__cpp_exceptions not defined), all exception-related checks are eliminated with * zero overhead. **/ template void pipeline(ThreadPool& pool, Stages&&... sIn) { ConcurrentTaskSet tasks(pool); auto pipes = detail::makePipes(tasks, std::forward(sIn)...); pipes.execute(); pipes.wait(); } /** * Pipeline work in stages. Pipelines allow stages to specify parallelism limits by using the * stage function, or a function-like object can simply be passed directly, indicating * a serial stage. Even if stages are serial, there can be parallelism between stages, so in a 3 * stage serial pipeline, the expected runtime is the max of the 3 stages runtimes (note that this * is in the absence of pipeline overheads and with an infinitely long workstream. In practice * speedup is somewhat less). Work will be run on dispenso's global thread pool. This function will * block until the entire pipeline has completed. * * @param sIn The stages to run. The first stage must be a Generator stage, the last must be a Sink * stage, and intermediate stages are Transform stages. * - If there is only one stage, it takes no * arguments, but returns a bool indicating completion (false means the pipeline is complete). * - Otherwise, the Generator stage takes no arguments and must return an OpResult or std::optional * value, and an invalid/nullopt result indicates that the Generator is done (no more values * forthcoming). * - Transform stages should accept the output of the prior stage (or output.value() in the case of * OpResult or std::optional), and should return either a value or an OpResult or std::optional * value if the Transform is capable of filtering results. Invalid/nullopt OpResult or std::optional * values indicate that the value should be filtered, and not passed on to the next stage. * - The Sink stage should accept the output of the prior stage, just as a Transform stage does, but * does not return any value (or at least the pipeline will ignore it). * * @note See the ThreadPool overload of pipeline() for exception behavior details. **/ template void pipeline(Stages&&... sIn) { pipeline(globalThreadPool(), std::forward(sIn)...); } } // namespace dispenso ================================================ FILE: dispenso/platform.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file platform.h * @ingroup group_util * Platform constants and common utilities. **/ #pragma once #include #include #include #include #include #include #include #if defined(_MSC_VER) && \ (defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM)) #include #endif namespace dispenso { #define DISPENSO_MAJOR_VERSION 1 #define DISPENSO_MINOR_VERSION 5 #define DISPENSO_PATCH_VERSION 1 // C++20 concepts support detection #if __cplusplus >= 202002L && defined(__cpp_concepts) && __cpp_concepts >= 201907L #define DISPENSO_HAS_CONCEPTS 1 #include #else #define DISPENSO_HAS_CONCEPTS 0 #endif /** * @def DISPENSO_REQUIRES * @brief Macro for conditionally applying C++20 concept constraints. * * On C++20 with concepts support, this expands to a requires clause. * On C++14/17, this expands to nothing, maintaining backward compatibility. 
* * Example usage: * @code * template * DISPENSO_REQUIRES(std::invocable) * void schedule(F&& f); * @endcode **/ #if DISPENSO_HAS_CONCEPTS #define DISPENSO_REQUIRES(...) requires(__VA_ARGS__) #else #define DISPENSO_REQUIRES(...) #endif #if defined(DISPENSO_SHARED_LIB) #if defined _WIN32 #if defined(DISPENSO_LIB_EXPORT) #define DISPENSO_DLL_ACCESS __declspec(dllexport) #else #define DISPENSO_DLL_ACCESS __declspec(dllimport) #endif // DISPENSO_LIB_EXPORT #elif defined(__clang__) || defined(__GNUC__) #define DISPENSO_DLL_ACCESS __attribute__((visibility("default"))) #endif // PLATFORM #endif // DISPENSO_SHARED_LIB #if !defined(DISPENSO_DLL_ACCESS) #define DISPENSO_DLL_ACCESS #endif // DISPENSO_DLL_ACCESS using ssize_t = std::make_signed::type; #if defined(__clang__) || defined(__GNUC__) #define DISPENSO_INLINE __attribute__((always_inline)) inline #elif defined(_MSC_VER) || defined(__INTEL_COMPILER) #define DISPENSO_INLINE __forceinline #else #define DISPENSO_INLINE inline #endif // PLATFORM /** * @var constexpr size_t kCacheLineSize * @brief A constant that defines a safe number of bytes+alignment to avoid false sharing. **/ #if defined(__APPLE__) && defined(__arm64__) constexpr size_t kCacheLineSize = 128; #else constexpr size_t kCacheLineSize = 64; #endif /** * @def DISPENSO_THREAD_LOCAL * @brief A macro that can be used when declaring a lightweight thread-local variable. **/ // TODO(bbudge): Non-gcc/clang/msvc platforms. #if defined(_MSC_VER) #define DISPENSO_THREAD_LOCAL __declspec(thread) #elif defined(__GNUC__) || defined(__clang__) #define DISPENSO_THREAD_LOCAL __thread #else #error Supply lightweight thread-locals for this compiler. Can define to thread_local if lightweight not available #endif #if (defined(__GNUC__) || defined(__clang__)) #define DISPENSO_EXPECT(a, b) __builtin_expect(a, b) #else #define DISPENSO_EXPECT(a, b) a #endif // clang-format off #if (defined(__GNUC__) || defined(__clang__)) #define DO_PRAGMA(X) _Pragma(#X) #define DISPENSO_DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push) #define DISPENSO_DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop) #define DISPENSO_DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName) #if !defined(__clang__) #define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS #define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS #else #define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS \ DISPENSO_DISABLE_WARNING(-Wgnu-zero-variadic-macro-arguments) #define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS \ DISPENSO_DISABLE_WARNING(-Wglobal-constructors) #endif #elif defined(_MSC_VER) #define DISPENSO_DISABLE_WARNING_PUSH __pragma(warning(push)) #define DISPENSO_DISABLE_WARNING_POP __pragma(warning(pop)) #define DISPENSO_DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber)) #define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS #define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS #else #define DISPENSO_DISABLE_WARNING_PUSH #define DISPENSO_DISABLE_WARNING_POP #define DISPENSO_DISABLE_WARNING_ZERO_VARIADIC_MACRO_ARGUMENTS #define DISPENSO_DISABLE_WARNING_GLOBAL_CONSTRUCTORS #endif // clang-format on /** * A wrapper that aligns the contained value to cache line boundaries. * * Useful for avoiding false sharing in concurrent data structures. * * @tparam T The type to wrap with cache line alignment. */ template class CacheAligned { public: CacheAligned() = default; /** Construct from a value. @param t The value to wrap. 
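 *
 * Example (a minimal sketch; the per-worker indexing is a placeholder):
 * @code
 * // Give each worker its own counter on its own cache line to avoid false sharing.
 * std::array<dispenso::CacheAligned<uint64_t>, 8> counters{};
 * uint64_t& mine = counters[workerIndex];  // Binds through operator T&.
 * ++mine;
 * @endcode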
*/ CacheAligned(T t) : t_(t) {} operator T&() { return t_; } operator const T&() const { return t_; } private: alignas(kCacheLineSize) T t_; }; namespace detail { template struct AlignedBuffer { alignas(alignof(T)) char b[sizeof(T)]; }; template struct alignas(kCacheLineSize) AlignedAtomic : public std::atomic {}; inline void* alignedMalloc(size_t bytes, size_t alignment) { alignment = std::max(alignment, sizeof(uintptr_t)); char* ptr = reinterpret_cast(::malloc(bytes + alignment)); uintptr_t base = reinterpret_cast(ptr); uintptr_t oldBase = base; uintptr_t mask = alignment - 1; base += alignment; base &= ~mask; uintptr_t* recovery = reinterpret_cast(base - sizeof(uintptr_t)); *recovery = oldBase; return reinterpret_cast(base); } inline void* alignedMalloc(size_t bytes) { return alignedMalloc(bytes, kCacheLineSize); } inline void alignedFree(void* ptr) { if (!ptr) { return; } char* p = reinterpret_cast(ptr); uintptr_t recovered = *reinterpret_cast(p - sizeof(uintptr_t)); ::free(reinterpret_cast(recovered)); } template struct AlignedFreeDeleter { void operator()(T* ptr) { ptr->~T(); detail::alignedFree(ptr); } }; template <> struct AlignedFreeDeleter { void operator()(void* ptr) { detail::alignedFree(ptr); } }; template std::shared_ptr make_shared(Args&&... args) { void* tv = alignedMalloc(sizeof(T), alignof(T)); T* t = new (tv) T(std::forward(args)...); return std::shared_ptr(t, AlignedFreeDeleter()); } inline constexpr uintptr_t alignToCacheLine(uintptr_t val) { constexpr uintptr_t kMask = kCacheLineSize - 1; val += kMask; val &= ~kMask; return val; } #if defined __x86_64__ || defined __i386__ inline void cpuRelax() { asm volatile("pause" ::: "memory"); } #elif defined _MSC_VER && (defined _M_AMD64 || defined _M_IX86) inline void cpuRelax() { _mm_pause(); } #elif defined __arm64__ || defined __aarch64__ inline void cpuRelax() { asm volatile("yield" ::: "memory"); } #elif defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM) inline void cpuRelax() { __yield(); } #elif defined __powerpc__ || defined __POWERPC__ #if defined __APPLE__ inline void cpuRelax() { asm volatile("or r27,r27,r27" ::: "memory"); } #else inline void cpuRelax() { asm volatile("or 27,27,27" ::: "memory"); } #endif // APPLE #else // TODO: provide reasonable relax on other archs. inline void cpuRelax() {} #endif // ARCH // When statically chunking a range, it is generally not possible to use a single chunk size plus // remainder and get a good load distribution. By estimating too high, we can have idle threads. By // estimating too low, the remainder can be several times as large as the chunk for other threads. // Instead, we compute the chunk size that is the ceil of the fractional chunk size. That can be // used for the first transitionIndex values, while the remaining (chunks - transitionTaskIndex) // values will be ceilChunkSize - 1. struct StaticChunking { ssize_t transitionTaskIndex; ssize_t ceilChunkSize; }; inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) { assert(chunks > 0); StaticChunking chunking; chunking.ceilChunkSize = (items + chunks - 1) / chunks; ssize_t numLeft = chunking.ceilChunkSize * chunks - items; chunking.transitionTaskIndex = chunks - numLeft; return chunking; } } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/pool_allocator.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include namespace dispenso { template PoolAllocatorT::PoolAllocatorT( size_t chunkSize, size_t allocSize, std::function allocFunc, std::function deallocFunc) : chunkSize_(chunkSize), allocSize_(allocSize), chunksPerAlloc_(allocSize / chunkSize), allocFunc_(std::move(allocFunc)), deallocFunc_(std::move(deallocFunc)) { // Start off with at least enough space to store at least one set of chunks. chunks_.reserve(chunksPerAlloc_); } template char* PoolAllocatorT::alloc() { while (true) { uint32_t allocId = 0; if (kThreadSafe) { allocId = backingAllocLock_.fetch_or(1, std::memory_order_acquire); } if (allocId == 0) { if (chunks_.empty()) { char* buffer; if (backingAllocs2_.empty()) { buffer = reinterpret_cast(allocFunc_(allocSize_)); } else { buffer = backingAllocs2_.back(); backingAllocs2_.pop_back(); } backingAllocs_.push_back(buffer); // Push n-1 values into the chunks_ buffer, and then return the nth. for (size_t i = 0; i < chunksPerAlloc_ - 1; ++i) { chunks_.push_back(buffer); buffer += chunkSize_; } if (kThreadSafe) { backingAllocLock_.store(0, std::memory_order_release); } return buffer; } char* back = chunks_.back(); chunks_.pop_back(); if (kThreadSafe) { backingAllocLock_.store(0, std::memory_order_release); } return back; } else { std::this_thread::yield(); } } } template void PoolAllocatorT::dealloc(char* ptr) { // For now do not release any memory back to the deallocFunc until destruction. // TODO(bbudge): Consider cases where we haven't gotten below some threshold of ready chunks // in a while. In that case, we could begin tracking allocations, and try to assemble entire // starting allocations, possibly deferring a small amount to each alloc call. This would be // slower, but would ensure we don't get into a situation where we need a bunch of memory up // front, and then never again. while (true) { uint32_t allocId = 0; if (kThreadSafe) { allocId = backingAllocLock_.fetch_or(1, std::memory_order_acquire); } if (allocId == 0) { chunks_.push_back(ptr); if (kThreadSafe) { backingAllocLock_.store(0, std::memory_order_release); } break; } } } template void PoolAllocatorT::clear() { chunks_.clear(); if (backingAllocs2_.size() < backingAllocs_.size()) { std::swap(backingAllocs2_, backingAllocs_); } for (char* ba : backingAllocs_) { backingAllocs2_.push_back(ba); } backingAllocs_.clear(); } template PoolAllocatorT::~PoolAllocatorT() { for (char* backing : backingAllocs_) { deallocFunc_(backing); } for (char* backing : backingAllocs2_) { deallocFunc_(backing); } } template class PoolAllocatorT; template class PoolAllocatorT; } // namespace dispenso ================================================ FILE: dispenso/pool_allocator.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file pool_allocator.h * @ingroup group_alloc * A pool allocator to help reduce calls to the underlying allocation and deallocation functions * that can be provided custom backing allocation and deallocation functions, e.g. cudaMalloc, * cudaFree. **/ #pragma once #include #include #include #include namespace dispenso { /** * A pool allocator to help reduce calls to the underlying allocation and deallocation functions. **/ template class PoolAllocatorT { public: /** * Construct a PoolAllocator. 
* * @param chunkSize The chunk size for each pool allocation * @param allocSize The size of underlying slabs to be chunked * @param allocFunc The underlying allocation function for allocating slabs * @param deallocFunc The underlying deallocation function. Currently only called on destruction. **/ DISPENSO_DLL_ACCESS PoolAllocatorT( size_t chunkSize, size_t allocSize, std::function allocFunc, std::function deallocFunc); /** * Allocate a chunk from a slab * * @return The pointer to a buffer of chunkSize bytes **/ DISPENSO_DLL_ACCESS char* alloc(); /** * Deallocate a previously allocated chunk * * @param ptr The chunk to return to the available pool **/ DISPENSO_DLL_ACCESS void dealloc(char* ptr); /** * Effectively dealloc all previously allocated chunks. Useful for arenas. * This function is not thread safe, and no previously allocated chunks may be dealloc'd after * clear. **/ DISPENSO_DLL_ACCESS void clear(); /** * Get the total capicity allocated in chunks (how many alloc() could be called without triggering * allocFunc() if all chunks were available) **/ size_t totalChunkCapacity() const { return (backingAllocs2_.size() + backingAllocs_.size()) * chunksPerAlloc_; } /** * Destruct a PoolAllocator **/ DISPENSO_DLL_ACCESS ~PoolAllocatorT(); private: const size_t chunkSize_; const size_t allocSize_; const size_t chunksPerAlloc_; std::function allocFunc_; std::function deallocFunc_; // Use of a spin lock was found to be faster than std::mutex in benchmarks. alignas(kCacheLineSize) std::atomic backingAllocLock_{0}; std::vector backingAllocs_; std::vector backingAllocs2_; std::vector chunks_; }; /** Thread-safe pool allocator with internal locking. */ using PoolAllocator = PoolAllocatorT; /** Pool allocator without locking, for single-threaded use or external synchronization. */ using NoLockPoolAllocator = PoolAllocatorT; } // namespace dispenso ================================================ FILE: dispenso/priority.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
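// Hypothetical usage sketch (not part of the original source) for the
// PoolAllocator shown above. The std::function signatures are assumed to be
// void*(size_t) for allocation and void(void*) for deallocation, since
// template arguments were stripped in this extract; the sizes and ::malloc/::free
// backing functions are illustrative only (e.g. cudaMalloc/cudaFree could be used).
#include <dispenso/pool_allocator.h>
#include <cstdlib>

void poolAllocatorExample() {
  // Hand out 256-byte chunks carved from 16 KiB slabs obtained via ::malloc.
  dispenso::PoolAllocator pool(
      256,
      16 * 1024,
      [](size_t bytes) { return ::malloc(bytes); },
      [](void* p) { ::free(p); });

  char* chunk = pool.alloc(); // a backing slab is allocated lazily on first use
  // ... use the 256-byte chunk ...
  pool.dealloc(chunk); // returns the chunk to the pool; slabs are released at destruction
}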
*/ #include #if (defined(__unix__) || defined(unix)) && !defined(USG) #include #endif #if defined(__linux__) #include #include #include #elif defined(__MACH__) #include #include #include #elif defined(_WIN32) #include #elif defined(BSD) #include #include #endif namespace dispenso { namespace { DISPENSO_THREAD_LOCAL ThreadPriority g_threadPriority = ThreadPriority::kNormal; } // namespace ThreadPriority getCurrentThreadPriority() { return g_threadPriority; } #ifdef __MACH__ bool setCurrentThreadPriority(ThreadPriority prio) { mach_port_t threadport = pthread_mach_thread_np(pthread_self()); if (prio == ThreadPriority::kRealtime) { mach_timebase_info_data_t info; mach_timebase_info(&info); double msToAbsTime = ((double)info.denom / (double)info.numer) * 1000000.0; thread_time_constraint_policy_data_t time_constraints; time_constraints.period = 0; time_constraints.computation = static_cast(1.0 * msToAbsTime); time_constraints.constraint = static_cast(10.0 * msToAbsTime); time_constraints.preemptible = 0; if (thread_policy_set( threadport, THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t)&time_constraints, THREAD_TIME_CONSTRAINT_POLICY_COUNT) != KERN_SUCCESS) { return false; } } // https://fergofrog.com/code/cbowser/xnu/osfmk/kern/sched.h.html#_M/MAXPRI_USER struct thread_precedence_policy ttcpolicy; switch (prio) { case ThreadPriority::kLow: ttcpolicy.importance = 20; break; case ThreadPriority::kNormal: ttcpolicy.importance = 37; break; case ThreadPriority::kHigh: // fallthrough case ThreadPriority::kRealtime: ttcpolicy.importance = 63; break; } if (thread_policy_set( threadport, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&ttcpolicy, THREAD_PRECEDENCE_POLICY_COUNT) != KERN_SUCCESS) { return false; } g_threadPriority = prio; return true; } #elif defined(_WIN32) // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadpriority bool setCurrentThreadPriority(ThreadPriority prio) { if (prio == ThreadPriority::kRealtime) { if (!SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS)) { return false; } } if (prio == ThreadPriority::kHigh) { // Best effort SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); } bool success = false; switch (prio) { case ThreadPriority::kLow: success = SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_LOWEST); break; case ThreadPriority::kNormal: success = SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL); break; case ThreadPriority::kHigh: // fallthrough case ThreadPriority::kRealtime: success = SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); break; } if (!success) { return false; } g_threadPriority = prio; return true; } #elif defined(__linux__) bool setCurrentThreadPriority(ThreadPriority prio) { if (prio == ThreadPriority::kRealtime) { struct sched_param param; param.sched_priority = 99; if (pthread_setschedparam(pthread_self(), SCHED_FIFO, ¶m)) { return false; } } switch (prio) { case ThreadPriority::kLow: errno = 0; (void)!nice(10); break; case ThreadPriority::kNormal: errno = 0; (void)!nice(0); break; case ThreadPriority::kHigh: // fallthrough case ThreadPriority::kRealtime: { struct rlimit rlim; getrlimit(RLIMIT_NICE, &rlim); if (rlim.rlim_max <= 20) { return false; } rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_NICE, &rlim); errno = 0; (void)!nice(static_cast(20 - rlim.rlim_max)); } } if (errno != 0) { return false; } g_threadPriority = prio; return true; } #elif defined(__FreeBSD__) // TODO: Find someone who has a FreeBSD system to test this code. 
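// Hypothetical usage sketch (not part of the original source): raising the
// priority of a dedicated worker thread via the setCurrentThreadPriority
// implementations above. The worker lambda and its contents are illustrative only.
#include <dispenso/priority.h>
#include <thread>

void runLatencySensitiveWorker() {
  std::thread worker([] {
    // Best effort: returns false if the platform refuses (e.g. insufficient privileges).
    if (!dispenso::setCurrentThreadPriority(dispenso::ThreadPriority::kHigh)) {
      // Proceed at the default priority.
    }
    // ... latency-sensitive work ...
  });
  worker.join();
}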
bool setCurrentThreadPriority(ThreadPriority prio) { struct rtprio rtp; if (prio == ThreadPriority::kRealtime) { rtp.type = RTP_PRIO_REALTIME; rtp.prio = 10; if (rtprio_thread(RTP_SET, 0, &rtp)) { return false; } } else { rtp.type = RTP_PRIO_NORMAL; switch (prio) { case ThreadPriority::kLow: rtp.prio = 31; break; case ThreadPriority::kNormal: rtp.prio = 15; break; case ThreadPriority::kHigh: // fallthrough case ThreadPriority::kRealtime: rtp.prio = 0; break; } if (rtprio_thread(RTP_SET, 0, &rtp)) { return false; } } g_threadPriority = prio; return true; } #else bool setCurrentThreadPriority(ThreadPriority prio) { return false; } #endif // platform } // namespace dispenso ================================================ FILE: dispenso/priority.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file priority.h * @ingroup group_util * * Utilities for getting and setting thread priority. This is an attempt to unify concepts for * thread priority usefully across multiple platforms. For finer control, use platform-specific * functionality. * * @note When using higher-than-normal priority, use caution! Too many threads running at too high * priority can have a strong negative impact on the responsiveness of the machine. Prefer to use * realtime priority only for short-running tasks that need to be very responsively run. **/ #pragma once #include namespace dispenso { /** * A thread priority setting. Enum values in increasing order of priority. **/ enum class ThreadPriority { kLow, kNormal, kHigh, kRealtime }; /** * Access the current thread priority as set by setCurrentThreadPriority. * * @return The priority of the current thread * * @note If the current thread priority has been set via a platform-specific mechanism, this may * return an incorrect value. **/ DISPENSO_DLL_ACCESS ThreadPriority getCurrentThreadPriority(); /** * Set the current thread's priority * * @param prio The priority to set to * * @return true if the priority was modified, false otherwise. **/ DISPENSO_DLL_ACCESS bool setCurrentThreadPriority(ThreadPriority prio); } // namespace dispenso ================================================ FILE: dispenso/resource_pool.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file resource_pool.h * @ingroup group_containers * A file providing ResourcePool. This is syntactic sugar over what is essentially a set of * semaphore-guarded resources. **/ #pragma once #include #include #include namespace dispenso { template class ResourcePool; /** * A RAII wrapper for a user's type that can manage accessibility and ensures the resource will go * back into the ResourcePool upon destruction. **/ template class Resource { public: /** Move constructor. */ Resource(Resource&& other) : resource_(other.resource_), pool_(other.pool_) { other.resource_ = nullptr; } Resource& operator=(Resource&& other) { if (&other != this) { recycle(); resource_ = other.resource_; pool_ = other.pool_; other.resource_ = nullptr; } return *this; } /** * Access the underlying resource object. * * @return a reference to the resource.
**/ T& get() { return *resource_; } ~Resource() { recycle(); } private: Resource(T* res, ResourcePool* pool) : resource_(res), pool_(pool) {} void recycle(); T* resource_; ResourcePool* pool_; friend class ResourcePool; }; /** * A pool of resources that can be accessed from multiple threads. This is akin to a set of * resources and a semaphore ensuring enough resources exist. **/ template class ResourcePool { public: /** * Construct a ResourcePool. * * @param size The number of T objects in the pool. * @param init A functor with signature T() which can be called to initialize the pool's * resources. **/ template ResourcePool(size_t size, const F& init) : pool_(size), backingResources_( reinterpret_cast( detail::alignedMalloc(size * detail::alignToCacheLine(sizeof(T))))), size_(size) { char* buf = backingResources_; // There are three reasons we create our own buffer and use placement new: // 1. We want to be able to handle non-movable non-copyable objects // * Note that we could do this with std::deque // 2. We want to minimize memory allocations, since that can be a common point of contention in // multithreaded programs. // 3. We can easily ensure that the objects are cache aligned to help avoid false sharing. for (size_t i = 0; i < size; ++i) { pool_.enqueue(new (buf) T(init())); buf += detail::alignToCacheLine(sizeof(T)); } } /** * Acquire a resource from the pool. This function may block until a resource becomes available. * * @return a Resource-wrapped resource. **/ Resource acquire() { T* t; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); pool_.wait_dequeue(t); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); return Resource(t, this); } /** * Destruct the ResourcePool. The user must ensure that all resources are returned to the pool * prior to destroying the pool. **/ ~ResourcePool() { assert(pool_.size_approx() == size_); for (size_t i = 0; i < size_; ++i) { T* t; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); pool_.wait_dequeue(t); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); t->~T(); } detail::alignedFree(backingResources_); } private: void recycle(T* t) { DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); pool_.enqueue(t); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); } moodycamel::BlockingConcurrentQueue pool_; char* backingResources_; size_t size_; friend class Resource; }; template void Resource::recycle() { if (resource_) { pool_->recycle(resource_); } } } // namespace dispenso ================================================ FILE: dispenso/rw_lock.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file rw_lock.h * @ingroup group_sync * A file providing RWLock, a reader/writer lock interface. **/ #include namespace dispenso { /** * A reader/writer lock interface compatible with std::shared_mutex (for use with std::unique_lock * and std::shared_lock). The interface is designed to be very fast in the face of high levels of * contention for high read traffic and low write traffic. * * @note RWLock is not as fully-featured as std::shared_mutex: It does not go to the OS to wait. * This behavior is good for guarding very fast operations, but less good for guarding very slow * operations. Additionally, RWLock is not compatible with std::condition_variable, though * std::condition_variable_any may work (untested). 
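// Hypothetical usage sketch (not part of the original source) for the
// ResourcePool shown above. The ScratchBuffer type, pool size of 4, and the
// init lambda are illustrative only; the pool's template argument is assumed
// to be the resource type, since template parameters were stripped in this extract.
#include <dispenso/resource_pool.h>
#include <vector>

struct ScratchBuffer {
  std::vector<float> data = std::vector<float>(1 << 20);
};

void useScratchBuffer() {
  // Four buffers shared by however many threads call this function concurrently.
  static dispenso::ResourcePool<ScratchBuffer> pool(4, [] { return ScratchBuffer(); });

  auto res = pool.acquire(); // may block until one of the four buffers is free
  ScratchBuffer& buf = res.get();
  // ... use buf ...
} // res is destroyed here and the buffer is recycled back into the pool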
It could be possible to extend RWLock with it's * own ConditionVariable, make waiting operations sleep in the OS, and also to add timed functions; * however those may slow things down in the fast case. If some/all of that functionality is * needed, use std::shared_mutex, or develop a new type. **/ class alignas(kCacheLineSize) RWLock : public detail::RWLockImpl { public: /** * Locks for write access * * @note It is undefined behavior to recursively lock **/ using detail::RWLockImpl::lock; /** * Tries to lock for write access, returns if unable to lock * * @return true if lock was acquired, false otherwise **/ using detail::RWLockImpl::try_lock; /** * Unlocks write access * * @note Must already be locked by the current thread of execution, otherwise, the behavior is * undefined. **/ using detail::RWLockImpl::unlock; /** * Locks for read access * * @note It is undefined behavior to recursively lock **/ using detail::RWLockImpl::lock_shared; /** * Tries to lock for read access, returns if unable to lock * * @return true if lock was acquired, false otherwise * * @note It is undefined behavior to recursively lock **/ using detail::RWLockImpl::try_lock_shared; /** * Unlocks read access * * @note Must already be locked by the current thread of execution, otherwise, the behavior is * undefined. **/ using detail::RWLockImpl::unlock_shared; /** * Upgrade from a reader lock to a writer lock. lock_upgrade is a power-user interface. There is * a very good reason why it is not exposed as upgrade_mutex in the standard. To use it safely, * you *MUST* ensure only one thread can try to lock for write concurrently. If that cannot be * guaranteed, you should unlock for read, and lock for write instead of using lock_upgrade to * avoid potential deadlock. * * @note Calling this if the writer lock is already held, or if no reader lock is already held is * undefined behavior. **/ using detail::RWLockImpl::lock_upgrade; /** * Downgrade the lock from a writer lock to a reader lock. * * @note Calling this if the writer lock is not held results in undefined behavior **/ using detail::RWLockImpl::lock_downgrade; }; /** * An unaligned version of the RWLock. This could be useful if you e.g. want to create an array of * these to guard a large number of slots, and the likelihood of multiple threads touching any * region concurrently is low. All other behavior remains the same, so refer to the documentation * for RWLock. **/ class UnalignedRWLock : public detail::RWLockImpl {}; } // namespace dispenso ================================================ FILE: dispenso/schedulable.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file schedulable.h * @ingroup group_core * Classes providing simple schedulables that match scheduling interfaces of *TaskSet and ThreadPool * **/ #pragma once #include #include namespace dispenso { /** * A class fullfilling the Schedulable concept that immediately invokes the functor. This can be * used in place of ThreadPool or TaskSet with Futures at * construction or through then, or it may be used in TimedTask scheduling for * short-running tasks. **/ class ImmediateInvoker { public: /** * Schedule a functor to be executed. It will be invoked immediately. * * @param f The functor to be executed. f's signature must match void(). 
Best * performance will come from passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f) const { f(); } /** * Schedule a functor to be executed. It is a bit oxymoronic to call this function, since * ForceQueuingTag will have no effect, and its use is discouraged. * **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f, ForceQueuingTag) const { f(); } }; constexpr ImmediateInvoker kImmediateInvoker; /** * A class fulfilling the Schedulable concept that always invokes on a new thread. This can be * used in place of ThreadPool or TaskSet with Futures at * construction or through then. **/ class NewThreadInvoker { public: /** * Schedule a functor to be executed on a new thread. * * @param f The functor to be executed. f's signature must match void(). Best * performance will come from passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f) const { schedule(std::forward(f), ForceQueuingTag()); } /** * Schedule a functor to be executed on a new thread. * * @param f The functor to be executed. f's signature must match void(). Best * performance will come from passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f, ForceQueuingTag) const { std::thread thread([f = std::move(f)]() { f(); }); thread.detach(); } private: }; constexpr NewThreadInvoker kNewThreadInvoker; } // namespace dispenso ================================================ FILE: dispenso/small_buffer_allocator.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree.
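// Hypothetical usage sketch (not part of the original source) for the
// schedulables shown above; the lambdas are illustrative only.
#include <dispenso/schedulable.h>

void schedulableExamples() {
  // Runs the lambda immediately, in the calling thread.
  dispenso::kImmediateInvoker.schedule([] { /* quick work */ });

  // Spawns (and detaches) a new thread that runs the lambda.
  dispenso::kNewThreadInvoker.schedule([] { /* independent work */ });
}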
*/ #include #include #include namespace dispenso { namespace detail { template SmallBufferGlobals& getSmallBufferGlobals() { // controlled leak here static SmallBufferGlobals* globals = new SmallBufferGlobals(); return *globals; } char* allocSmallBufferImpl(size_t ordinal) { switch (ordinal) { case 0: return detail::SmallBufferAllocator<4>::alloc(); case 1: return detail::SmallBufferAllocator<8>::alloc(); case 2: return detail::SmallBufferAllocator<16>::alloc(); case 3: return detail::SmallBufferAllocator<32>::alloc(); case 4: return detail::SmallBufferAllocator<64>::alloc(); case 5: return detail::SmallBufferAllocator<128>::alloc(); case 6: return detail::SmallBufferAllocator<256>::alloc(); default: assert(false && "Invalid small buffer ordinal requested"); return nullptr; } } void deallocSmallBufferImpl(size_t ordinal, void* buf) { switch (ordinal) { case 0: detail::SmallBufferAllocator<4>::dealloc(reinterpret_cast(buf)); break; case 1: detail::SmallBufferAllocator<8>::dealloc(reinterpret_cast(buf)); break; case 2: detail::SmallBufferAllocator<16>::dealloc(reinterpret_cast(buf)); break; case 3: detail::SmallBufferAllocator<32>::dealloc(reinterpret_cast(buf)); break; case 4: detail::SmallBufferAllocator<64>::dealloc(reinterpret_cast(buf)); break; case 5: detail::SmallBufferAllocator<128>::dealloc(reinterpret_cast(buf)); break; case 6: detail::SmallBufferAllocator<256>::dealloc(reinterpret_cast(buf)); break; default: assert(false && "Invalid small buffer ordinal requested"); } } size_t approxBytesAllocatedSmallBufferImpl(size_t ordinal) { switch (ordinal) { case 0: return detail::SmallBufferAllocator<4>::bytesAllocated(); case 1: return detail::SmallBufferAllocator<8>::bytesAllocated(); case 2: return detail::SmallBufferAllocator<16>::bytesAllocated(); case 3: return detail::SmallBufferAllocator<32>::bytesAllocated(); case 4: return detail::SmallBufferAllocator<64>::bytesAllocated(); case 5: return detail::SmallBufferAllocator<128>::bytesAllocated(); case 6: return detail::SmallBufferAllocator<256>::bytesAllocated(); default: assert(false && "Invalid small buffer ordinal requested"); return 0; } } template SmallBufferAllocator::PerThreadQueuingData::~PerThreadQueuingData() { enqueue_bulk(buffers_, count_); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); ptoken().~ProducerToken(); ctoken().~ConsumerToken(); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); } template class SmallBufferAllocator<4>; template class SmallBufferAllocator<8>; template class SmallBufferAllocator<16>; template class SmallBufferAllocator<32>; template class SmallBufferAllocator<64>; template class SmallBufferAllocator<128>; template class SmallBufferAllocator<256>; } // namespace detail } // namespace dispenso ================================================ FILE: dispenso/small_buffer_allocator.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file small_buffer_allocator.h * @ingroup group_alloc * A file providing SmallBufferAllocator. This allocator can allocate and deallocate chunks of a * set size in a way that is efficient and scales quite well across many threads. **/ #pragma once #include #include namespace dispenso { /** * Set a standard for the maximum chunk size for use within dispenso. The reason for this limit is * that there are diminishing returns after a certain size, and each new pool has it's own memory * overhead. 
**/ constexpr size_t kMaxSmallBufferSize = 256; namespace detail { DISPENSO_DLL_ACCESS char* allocSmallBufferImpl(size_t ordinal); DISPENSO_DLL_ACCESS void deallocSmallBufferImpl(size_t ordinal, void* buf); DISPENSO_DLL_ACCESS size_t approxBytesAllocatedSmallBufferImpl(size_t ordinal); // This has the effect of selecting actual block sizes starting with 4 bytes. Smaller requests // (e.g. 1 byte, 2 bytes) will still utilize 4-byte blocks. Choice of 4 bytes as the smallest // mainly aligns to sizeof(ptr) on 32-bit platforms, where we'd expect most common use cases to be // no smaller than one pointer. Retaining 4-byte buckets on 64-bit platforms doesn't cost much // (tiny startup/teardown cost, and trivial amount of memory) when not using 4-byte or smaller // allocations, and makes the code simpler. constexpr size_t getOrdinal(size_t blockSize) { return static_cast(std::max(0, static_cast(log2const(blockSize)) - 2)); } template inline std::enable_if_t<(kBlockSize <= kMaxSmallBufferSize), char*> allocSmallOrLarge() { #if defined(DISPENSO_NO_SMALL_BUFFER_ALLOCATOR) return reinterpret_cast(alignedMalloc(kBlockSize, kBlockSize)); #else return allocSmallBufferImpl(getOrdinal(kBlockSize)); #endif // DISPENSO_NO_SMALL_BUFFER_ALLOCATOR } template inline std::enable_if_t<(kBlockSize > kMaxSmallBufferSize), char*> allocSmallOrLarge() { return reinterpret_cast(alignedMalloc(kBlockSize, kBlockSize)); } template inline std::enable_if_t<(kBlockSize <= kMaxSmallBufferSize), void> deallocSmallOrLarge(void* buf) { #if defined(DISPENSO_NO_SMALL_BUFFER_ALLOCATOR) alignedFree(buf); #else deallocSmallBufferImpl(getOrdinal(kBlockSize), buf); #endif // DISPENSO_NO_SMALL_BUFFER_ALLOCATOR } template inline std::enable_if_t<(kBlockSize > kMaxSmallBufferSize), void> deallocSmallOrLarge(void* buf) { alignedFree(buf); } } // namespace detail /** * Allocate a small buffer from a small buffer pool. * * @tparam kBlockSize The size of the block to allocate. Must be a power of two, and must be less * than or equal to kMaxSmallBufferSize. * @return The pointer to the allocated block of memory. * @note: The returned buffer must be returned to the pool via deallocSmallBuffer templatized on the * same block size. If kBlockSize > kMaxSmallBufferSize, this function falls back on alignedMalloc. * If DISPENSO_NO_SMALL_BUFFER_ALLOCATOR is defined, we will always fall back on * alignedMalloc/alignedFree. **/ template inline char* allocSmallBuffer() { return detail::allocSmallOrLarge(); } /** * Free a small buffer from a small buffer pool. * * @tparam kBlockSize The size of the block to allocate. Must be a power of two, and must be less * than or equal to kMaxSmallBufferSize. * @param buf the pointer to block of memory to return to the pool. Must have been allocated with * allocSmallBuffer templatized on the same block size. * @note: If kBlockSize > kMaxSmallBufferSize, this function falls back on alignedFree. **/ template inline void deallocSmallBuffer(void* buf) { detail::deallocSmallOrLarge(buf); } /** * Get the approximate bytes allocated for a single small buffer pool (associated with *kBlockSize). This function is not highly performant and locks, and should only be used for *diagnostics (e.g. tests). * * @tparam kBlockSize The block size for the pool to query. 
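// Hypothetical usage sketch (not part of the original source) for the
// allocSmallBuffer/deallocSmallBuffer functions shown above; the 64-byte block
// size is illustrative only.
#include <dispenso/small_buffer_allocator.h>

void smallBufferExample() {
  // Block size must be a power of two; sizes above kMaxSmallBufferSize fall
  // back to alignedMalloc/alignedFree as documented above.
  char* buf = dispenso::allocSmallBuffer<64>();
  // ... use the 64-byte buffer ...
  dispenso::deallocSmallBuffer<64>(buf);
}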
**/ template size_t approxBytesAllocatedSmallBuffer() { return detail::approxBytesAllocatedSmallBufferImpl(detail::getOrdinal(kBlockSize)); } } // namespace dispenso ================================================ FILE: dispenso/small_vector.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file small_vector.h * @ingroup group_util * A vector-like container with inline storage for small sizes. * * SmallVector stores up to N elements inline (on the stack), falling back to * heap allocation when size exceeds N. Uses the high bit of the size field to * track storage mode, keeping the struct compact with no extra flags. * * Once transitioned to heap storage, the vector stays on heap until clear() * is called. This ensures reserve() guarantees are honored and avoids * unnecessary copies between storage modes. **/ #pragma once #include #include #include #include #include namespace dispenso { /** * A vector-like container that stores up to N elements inline (on the stack), * avoiding heap allocation for small sizes. Falls back to heap allocation when * the size exceeds N. * * @tparam T The element type. * @tparam N The number of elements to store inline (default 4). Must be in range [1, 65535]. * * Memory layout: Uses a union to share space between inline storage and heap * pointer/capacity. The high bit of the size field tracks whether heap storage * is active, so no additional bool or flag field is needed. * * Once on heap, the vector stays on heap until clear() is called. This ensures * reserve() capacity is preserved across push_back/pop_back sequences. */ template class SmallVector { static_assert(N > 0, "SmallVector requires at least 1 inline element. Use std::vector for N=0."); static_assert(N < 65536, "SmallVector inline capacity is too large. Use std::vector instead."); public: using value_type = T; using size_type = size_t; using difference_type = std::ptrdiff_t; using reference = T&; using const_reference = const T&; using pointer = T*; using const_pointer = const T*; using iterator = T*; using const_iterator = const T*; /** * Construct an empty SmallVector. **/ SmallVector() noexcept : size_(0) {} /** * Construct a SmallVector with @p count default-constructed elements. * * @param count The number of elements. **/ explicit SmallVector(size_type count) : size_(0) { resize(count); } /** * Construct a SmallVector with @p count copies of @p value. * * @param count The number of elements. * @param value The value to copy into each element. **/ SmallVector(size_type count, const T& value) : size_(0) { resize(count, value); } /** * Construct a SmallVector from an initializer list. * * @param init The initializer list of elements. **/ SmallVector(std::initializer_list init) : size_(0) { ensureCapacity(init.size()); for (const auto& v : init) { emplace_back(v); } } /** Copy constructor. **/ SmallVector(const SmallVector& other) : size_(0) { ensureCapacity(other.rawSize()); for (const auto& v : other) { emplace_back(v); } } /** Move constructor. Moves elements and leaves @p other empty. 
**/ SmallVector(SmallVector&& other) noexcept : size_(0) { if (other.isInline()) { for (size_type i = 0; i < other.rawSize(); ++i) { new (inlineData() + i) T(std::move(other.inlineData()[i])); other.inlineData()[i].~T(); } } else { storage_.heap_.ptr = other.storage_.heap_.ptr; storage_.heap_.capacity = other.storage_.heap_.capacity; size_ |= kHeapBit; } size_ = (size_ & kHeapBit) | other.rawSize(); other.size_ = 0; } ~SmallVector() { destroyAll(); } /** Copy assignment operator. **/ SmallVector& operator=(const SmallVector& other) { if (this != &other) { destroyAll(); size_ = 0; ensureCapacity(other.rawSize()); for (const auto& v : other) { emplace_back(v); } } return *this; } /** Move assignment operator. **/ SmallVector& operator=(SmallVector&& other) noexcept { if (this != &other) { destroyAll(); size_ = 0; if (other.isInline()) { for (size_type i = 0; i < other.rawSize(); ++i) { new (inlineData() + i) T(std::move(other.inlineData()[i])); other.inlineData()[i].~T(); } } else { storage_.heap_.ptr = other.storage_.heap_.ptr; storage_.heap_.capacity = other.storage_.heap_.capacity; size_ |= kHeapBit; } size_ = (size_ & kHeapBit) | other.rawSize(); other.size_ = 0; } return *this; } // --- Element Access --- /** Access element at @p pos (unchecked). **/ reference operator[](size_type pos) { return data()[pos]; } /** @copydoc operator[](size_type) **/ const_reference operator[](size_type pos) const { return data()[pos]; } /** Access the first element. **/ reference front() { return data()[0]; } /** @copydoc front() **/ const_reference front() const { return data()[0]; } /** Access the last element. **/ reference back() { return data()[rawSize() - 1]; } /** @copydoc back() **/ const_reference back() const { return data()[rawSize() - 1]; } /** Return a pointer to the underlying element storage. **/ pointer data() noexcept { return isInline() ? inlineData() : storage_.heap_.ptr; } /** @copydoc data() **/ const_pointer data() const noexcept { return isInline() ? inlineData() : storage_.heap_.ptr; } // --- Iterators --- /** Return an iterator to the first element. **/ iterator begin() noexcept { return data(); } /** @copydoc begin() **/ const_iterator begin() const noexcept { return data(); } /** @copydoc begin() **/ const_iterator cbegin() const noexcept { return data(); } /** Return an iterator past the last element. **/ iterator end() noexcept { return data() + rawSize(); } /** @copydoc end() **/ const_iterator end() const noexcept { return data() + rawSize(); } /** @copydoc end() **/ const_iterator cend() const noexcept { return data() + rawSize(); } // --- Capacity --- /** Check whether the vector is empty. **/ bool empty() const noexcept { return rawSize() == 0; } /** Return the number of elements. **/ size_type size() const noexcept { return rawSize(); } /** * Return the current capacity. * Returns N while using inline storage, or the heap capacity otherwise. **/ size_type capacity() const noexcept { return isInline() ? N : storage_.heap_.capacity; } /** * Reserve capacity for at least newCap elements. * If newCap > N, transitions to heap storage. * Once on heap, capacity is preserved until clear(). */ void reserve(size_type newCap) { ensureCapacity(newCap); } // --- Modifiers --- /** * Remove all elements and release heap storage (if any). * After clear(), the vector returns to inline storage mode. **/ void clear() noexcept { destroyAll(); size_ = 0; } /** Append a copy of @p value. **/ void push_back(const T& value) { emplace_back(value); } /** Append @p value by moving. 
**/ void push_back(T&& value) { emplace_back(std::move(value)); } /** * Construct an element in-place at the end. * * @param args Arguments forwarded to the element constructor. * @return A reference to the newly constructed element. **/ template reference emplace_back(Args&&... args) { T* ptr; if (isInline()) { size_type sz = rawSize(); if (sz < N) { ptr = inlineData(); } else { growToHeap(N * 2); ptr = storage_.heap_.ptr; } } else { size_type sz = rawSize(); if (sz == storage_.heap_.capacity) { growToHeap(storage_.heap_.capacity * 2); } ptr = storage_.heap_.ptr; } size_type idx = rawSize(); new (ptr + idx) T(std::forward(args)...); // Increment preserves heap bit naturally ++size_; assert(rawSize() > 0 && "Size overflow into heap bit"); return ptr[idx]; } /** Remove the last element. **/ void pop_back() { T* ptr = data(); size_type sz = rawSize(); ptr[sz - 1].~T(); // Decrement preserves heap bit naturally --size_; } /** * Resize the vector to @p count elements, default-constructing new elements if growing. * * @param count The desired number of elements. **/ void resize(size_type count) { size_type sz = rawSize(); if (count > sz) { ensureCapacity(count); T* ptr = data(); for (size_type i = sz; i < count; ++i) { new (ptr + i) T(); } setSize(count); } else if (count < sz) { T* ptr = data(); for (size_type i = count; i < sz; ++i) { ptr[i].~T(); } setSize(count); } } /** * Resize the vector to @p count elements, copy-constructing new elements from @p value if * growing. * * @param count The desired number of elements. * @param value The value to copy into new elements. **/ void resize(size_type count, const T& value) { size_type sz = rawSize(); if (count > sz) { ensureCapacity(count); T* ptr = data(); for (size_type i = sz; i < count; ++i) { new (ptr + i) T(value); } setSize(count); } else if (count < sz) { T* ptr = data(); for (size_type i = count; i < sz; ++i) { ptr[i].~T(); } setSize(count); } } /** * Erase the element at @p pos. * Elements after @p pos are shifted left. Returns an iterator to the element that now * occupies the erased position (or end() if the last element was erased). * * @param pos Iterator to the element to erase. * @return Iterator to the element following the erased one. **/ iterator erase(const_iterator pos) { T* ptr = data(); size_type sz = rawSize(); size_type index = pos - ptr; for (size_type i = index; i + 1 < sz; ++i) { ptr[i] = std::move(ptr[i + 1]); } ptr[sz - 1].~T(); --size_; // Preserves heap bit return data() + index; } private: // High bit of size_ tracks heap vs inline mode. // Since kHeapBit >> N, comparisons like size_ <= N and size_ < N // are naturally false when the heap bit is set, so isInline() and // many internal checks work without masking. static constexpr size_type kHeapBit = size_type(1) << (sizeof(size_type) * 8 - 1); static constexpr size_type kSizeMask = ~kHeapBit; bool isInline() const noexcept { return (size_ & kHeapBit) == 0; } size_type rawSize() const noexcept { return size_ & kSizeMask; } // Set the size portion, preserving the heap bit. 
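// Hypothetical usage sketch (not part of the original source) for the
// SmallVector shown above, assuming the template takes the element type and the
// inline element count (template parameters were stripped in this extract).
#include <dispenso/small_vector.h>

void smallVectorExample() {
  // Up to 4 ints live inline; the 5th push_back moves storage to the heap.
  dispenso::SmallVector<int, 4> v;
  for (int i = 0; i < 4; ++i) {
    v.push_back(i); // still inline: capacity() == 4, no heap allocation yet
  }
  v.push_back(4);      // transitions to heap storage; capacity() grows to at least 8
  v.erase(v.begin());  // shifts the remaining elements left by one
  v.clear();           // releases heap storage and returns to inline mode
}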
void setSize(size_type s) noexcept { assert((s & kHeapBit) == 0 && "Size overflow into heap bit"); size_ = (size_ & kHeapBit) | s; } T* inlineData() noexcept { return reinterpret_cast(&storage_.inline_); } const T* inlineData() const noexcept { return reinterpret_cast(&storage_.inline_); } void destroyAll() noexcept { T* ptr = data(); size_type sz = rawSize(); for (size_type i = 0; i < sz; ++i) { ptr[i].~T(); } if (!isInline()) { ::operator delete(storage_.heap_.ptr); } } // Grow to heap storage with the specified capacity. // Moves existing elements, frees old heap if applicable, sets heap bit. void growToHeap(size_type newCap) { T* newData = static_cast(::operator new(newCap * sizeof(T))); T* oldData = data(); size_type sz = rawSize(); for (size_type i = 0; i < sz; ++i) { new (newData + i) T(std::move(oldData[i])); oldData[i].~T(); } if (!isInline()) { ::operator delete(storage_.heap_.ptr); } storage_.heap_.ptr = newData; storage_.heap_.capacity = newCap; size_ = kHeapBit | sz; } void ensureCapacity(size_type newCap) { if (newCap <= N && isInline()) { return; } if (isInline()) { growToHeap(newCap); } else if (newCap > storage_.heap_.capacity) { growToHeap(newCap); } } size_type size_; struct HeapStorage { T* ptr; size_type capacity; }; union Storage { alignas(T) unsigned char inline_[sizeof(T) * N]; HeapStorage heap_; Storage() noexcept {} ~Storage() {} } storage_; }; } // namespace dispenso ================================================ FILE: dispenso/spsc_ring_buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file spsc_ring_buffer.h * @ingroup group_util * A lock-free single-producer single-consumer (SPSC) ring buffer. * * This buffer is designed for high-performance communication between exactly one producer * thread and one consumer thread. It uses a fixed-capacity circular buffer with atomic * head and tail pointers, providing wait-free operations in the common case. * * @note This implementation is NOT thread-safe for multiple producers or multiple consumers. * Use only with exactly one producer thread and one consumer thread. **/ #pragma once #include #include #include #include #include #include #include namespace dispenso { // TODO(bbudge): Add kDynamicCapacity specialization that heap-allocates storage // with runtime-determined capacity. /** * @class SPSCRingBuffer * @brief A lock-free single-producer single-consumer ring buffer with fixed capacity. * * This class implements a bounded, lock-free ring buffer optimized for the case where * there is exactly one producer thread and exactly one consumer thread. It uses relaxed * memory ordering where possible and acquire/release semantics only where necessary * for correctness. * * The buffer stores elements in a contiguous array, avoiding dynamic memory allocation * after construction. The capacity is fixed at compile time via a template parameter. * * Unlike std::array-based implementations, this buffer does NOT require the element type * to be default-constructible. Elements are constructed in-place when pushed and destroyed * when popped. * * @tparam T The type of elements stored in the buffer. Must be move-constructible. * @tparam Capacity The minimum number of elements the buffer can hold. Must be at least 1. * Defaults to 16, which provides good cache locality. 
* @tparam RoundUpToPowerOfTwo If true (default), rounds up the internal buffer size to the * next power of two for faster index wrap-around using bitwise * AND instead of modulo. This may result in actual capacity being * larger than requested. Set to false to use exactly the requested * capacity. * * ## Thread Safety * * This class is designed for exactly one producer thread and one consumer thread: * - Only one thread may call `try_push()` or `try_emplace()` at any time * - Only one thread may call `try_pop()` at any time * - The producer and consumer may be different threads * - `empty()`, `full()`, and `size()` may be called from any thread, but provide * only a snapshot that may be immediately stale * * ## Memory Ordering * * The implementation uses: * - `memory_order_relaxed` for local reads of head/tail * - `memory_order_acquire` when reading the "other" index (consumer reads head, producer * reads tail) * - `memory_order_release` when updating head/tail after successful push/pop * * ## Performance Characteristics * * - Push: O(1), wait-free when buffer is not full * - Pop: O(1), wait-free when buffer is not empty * - Memory: sizeof(T) * (actual capacity + 1) + 2 cache lines for head/tail indices * - When RoundUpToPowerOfTwo is true (default), index wrap-around uses fast bitwise AND * * ## Example Usage * * @code * // Default: rounds up to power of two for performance * dispenso::SPSCRingBuffer buffer; // actual capacity is 127 (128 - 1) * * // Exact capacity mode (no rounding) * dispenso::SPSCRingBuffer exact; // actual capacity is 100 * * // Producer thread * if (buffer.try_push(42)) { * // Success * } * * // Consumer thread * int value; * if (buffer.try_pop(value)) { * // Use value * } * @endcode * * @see https://www.1024cores.net/home/lock-free-algorithms/queues/bounded-mpmc-queue * for background on lock-free queue algorithms */ template class SPSCRingBuffer { static_assert(Capacity >= 1, "SPSCRingBuffer capacity must be at least 1"); static_assert( std::is_move_constructible::value, "SPSCRingBuffer element type must be move-constructible"); public: /** * @brief The type of elements stored in this buffer. */ using value_type = T; /** * @brief The size type used for indices and counts. */ using size_type = size_t; /** * @brief Constructs an empty ring buffer. * * The buffer is initialized with no elements. The internal storage is * uninitialized - elements are only constructed when pushed. * * @note Construction is not thread-safe. Ensure the buffer is fully * constructed before any thread accesses it. */ SPSCRingBuffer() = default; /** * @brief Ring buffers are not copyable. * * Copying a concurrent data structure would require synchronization * and could lead to subtle bugs. Use move semantics if you need to * transfer ownership. */ SPSCRingBuffer(const SPSCRingBuffer&) = delete; /** * @brief Ring buffers are not copy-assignable. * @see SPSCRingBuffer(const SPSCRingBuffer&) */ SPSCRingBuffer& operator=(const SPSCRingBuffer&) = delete; /** * @brief Ring buffers are not movable. * * Moving a ring buffer while producer/consumer threads are active * would be unsafe. If you need to transfer ownership, ensure all * threads have stopped first. */ SPSCRingBuffer(SPSCRingBuffer&&) = delete; /** * @brief Ring buffers are not move-assignable. * @see SPSCRingBuffer(SPSCRingBuffer&&) */ SPSCRingBuffer& operator=(SPSCRingBuffer&&) = delete; /** * @brief Destroys the ring buffer. * * All elements remaining in the buffer are destroyed. 
Ensure no * producer or consumer threads are accessing the buffer when it * is destroyed. * * @note Destruction is not thread-safe. */ ~SPSCRingBuffer() { // Destroy any remaining elements size_t head = head_.load(std::memory_order_relaxed); size_t tail = tail_.load(std::memory_order_relaxed); while (head != tail) { elementAt(head)->~T(); head = increment(head); } } /** * @brief Attempts to push an element into the buffer by moving. * * If the buffer has space, the element is moved into the buffer and * the function returns true. If the buffer is full, the function * returns false and the element is unchanged. * * @param item The element to push (will be moved from on success). * @return true if the element was successfully pushed, false if the buffer was full. * * @note Only one thread may call this function at any time (single producer). * @note This operation is wait-free. * * ## Example * @code * SPSCRingBuffer buffer; * std::string msg = "hello"; * if (buffer.try_push(std::move(msg))) { * // msg is now empty (moved from) * } else { * // msg is unchanged, buffer was full * } * @endcode */ bool try_push(T&& item) { const size_t currentTail = tail_.load(std::memory_order_relaxed); const size_t nextTail = increment(currentTail); // Check if buffer is full if (nextTail == head_.load(std::memory_order_acquire)) { return false; } // Construct element in-place new (elementAt(currentTail)) T(std::move(item)); tail_.store(nextTail, std::memory_order_release); return true; } /** * @brief Attempts to push an element into the buffer by copying. * * If the buffer has space, the element is copied into the buffer and * the function returns true. If the buffer is full, the function * returns false. * * @param item The element to push (will be copied). * @return true if the element was successfully pushed, false if the buffer was full. * * @note Only one thread may call this function at any time (single producer). * @note This operation is wait-free. * @note Prefer try_push(T&&) when the source element is no longer needed. */ bool try_push(const T& item) { const size_t currentTail = tail_.load(std::memory_order_relaxed); const size_t nextTail = increment(currentTail); // Check if buffer is full if (nextTail == head_.load(std::memory_order_acquire)) { return false; } // Construct element in-place via copy new (elementAt(currentTail)) T(item); tail_.store(nextTail, std::memory_order_release); return true; } /** * @brief Attempts to construct an element in-place in the buffer. * * If the buffer has space, constructs an element directly in the buffer * storage using the provided arguments, avoiding any copy or move operations. * * @tparam Args The types of arguments to forward to T's constructor. * @param args The arguments to forward to the element constructor. * @return true if the element was successfully emplaced, false if the buffer was full. * * @note Only one thread may call this function at any time (single producer). * @note This operation is wait-free. * * ## Example * @code * SPSCRingBuffer, 4> buffer; * if (buffer.try_emplace(42, "hello")) { * // Element constructed in-place * } * @endcode */ template bool try_emplace(Args&&... 
args) { const size_t currentTail = tail_.load(std::memory_order_relaxed); const size_t nextTail = increment(currentTail); // Check if buffer is full if (nextTail == head_.load(std::memory_order_acquire)) { return false; } // Construct element in-place new (elementAt(currentTail)) T(std::forward(args)...); tail_.store(nextTail, std::memory_order_release); return true; } /** * @brief Attempts to pop an element from the buffer. * * If the buffer has elements, moves the front element into the output * parameter and returns true. If the buffer is empty, returns false * and leaves the output parameter unchanged. * * @param[out] item The location to move the popped element to. * @return true if an element was successfully popped, false if the buffer was empty. * * @note Only one thread may call this function at any time (single consumer). * @note This operation is wait-free. * * ## Example * @code * SPSCRingBuffer buffer; * buffer.try_push(42); * * int value; * if (buffer.try_pop(value)) { * assert(value == 42); * } * @endcode */ bool try_pop(T& item) { const size_t currentHead = head_.load(std::memory_order_relaxed); // Check if buffer is empty if (currentHead == tail_.load(std::memory_order_acquire)) { return false; } T* elem = elementAt(currentHead); item = std::move(*elem); elem->~T(); head_.store(increment(currentHead), std::memory_order_release); return true; } /** * @brief Attempts to pop an element from the buffer, returning an optional. * * If the buffer has elements, moves the front element into an OpResult * and returns it. If the buffer is empty, returns an empty OpResult. * * This provides a cleaner API than try_pop(T&) when default-constructibility * of T is not guaranteed or when a more functional style is preferred. * * @return An OpResult containing the popped element, or an empty OpResult * if the buffer was empty. * * @note Only one thread may call this function at any time (single consumer). * @note This operation is wait-free. * @note In C++17 and beyond, you may prefer to use std::optional directly. * * ## Example * @code * SPSCRingBuffer buffer; * buffer.try_push("hello"); * * if (auto result = buffer.try_pop()) { * std::cout << result.value() << std::endl; * } * @endcode */ OpResult try_pop() { const size_t currentHead = head_.load(std::memory_order_relaxed); // Check if buffer is empty if (currentHead == tail_.load(std::memory_order_acquire)) { return {}; } T* elem = elementAt(currentHead); OpResult result(std::move(*elem)); elem->~T(); head_.store(increment(currentHead), std::memory_order_release); return result; } /** * @brief Attempts to pop an element into uninitialized storage. * * Similar to try_pop, but uses placement new to construct the element * into the provided storage. This is useful when T is not default-constructible. * * @param[out] storage Pointer to uninitialized storage where the element will be * move-constructed. Must have proper alignment for T. * @return true if an element was successfully popped, false if the buffer was empty. * * @note Only one thread may call this function at any time (single consumer). * @note The caller is responsible for eventually destroying the constructed object. * @note This operation is wait-free. 
* * ## Example * @code * SPSCRingBuffer buffer; * alignas(NonDefaultConstructible) char storage[sizeof(NonDefaultConstructible)]; * if (buffer.try_pop_into(reinterpret_cast(storage))) { * auto* ptr = reinterpret_cast(storage); * // use *ptr * ptr->~NonDefaultConstructible(); * } * @endcode */ bool try_pop_into(T* storage) { const size_t currentHead = head_.load(std::memory_order_relaxed); // Check if buffer is empty if (currentHead == tail_.load(std::memory_order_acquire)) { return false; } T* elem = elementAt(currentHead); new (storage) T(std::move(*elem)); elem->~T(); head_.store(increment(currentHead), std::memory_order_release); return true; } /** * @brief Attempts to push multiple elements into the buffer. * * Pushes as many elements as possible from the range [first, last) into the buffer * in a single atomic tail update. This reduces atomic operation overhead when * pushing multiple items. * * @tparam InputIt Input iterator type. Must dereference to a type convertible to T. * @param first Iterator to the first element to push. * @param last Iterator past the last element to push. * @return The number of elements successfully pushed. May be less than the * range size if the buffer becomes full. * * @note Only one thread may call this function at any time (single producer). * @note Elements are moved from the input range. * * ## Example * @code * SPSCRingBuffer buffer; * std::vector items = {1, 2, 3, 4, 5}; * size_t pushed = buffer.try_push_batch(items.begin(), items.end()); * // pushed contains the number of items successfully added * @endcode */ template size_type try_push_batch(InputIt first, InputIt last) { const size_t currentTail = tail_.load(std::memory_order_relaxed); const size_t currentHead = head_.load(std::memory_order_acquire); // Calculate available space (actual capacity is kBufferSize - 1) size_t available; if (currentTail >= currentHead) { // Tail is ahead of or at head: available = capacity - (tail - head) available = (kBufferSize - 1) - (currentTail - currentHead); } else { // Tail has wrapped: available = head - tail - 1 available = currentHead - currentTail - 1; } if (available == 0) { return 0; } // Push as many items as possible size_t count = 0; size_t tailPos = currentTail; for (; first != last && count < available; ++first, ++count) { new (elementAt(tailPos)) T(std::move(*first)); tailPos = increment(tailPos); } if (count > 0) { tail_.store(tailPos, std::memory_order_release); } return count; } /** * @brief Attempts to pop multiple elements from the buffer. * * Pops up to maxCount elements from the buffer into the output iterator * in a single atomic head update. This reduces atomic operation overhead * when consuming multiple items. * * @tparam OutputIt Output iterator type. Must be assignable from T. * @param dest Output iterator to write popped elements to. * @param maxCount Maximum number of elements to pop. * @return The number of elements successfully popped. May be less than * maxCount if the buffer has fewer elements. * * @note Only one thread may call this function at any time (single consumer). * @note Elements are moved to the output range. * * ## Example * @code * SPSCRingBuffer buffer; * // ... push some items ... 
   *   std::vector items(4);
   *   size_t popped = buffer.try_pop_batch(items.begin(), 4);
   *   // First 'popped' elements of items contain the popped values
   * @endcode
   */
  template <typename OutputIt>
  size_type try_pop_batch(OutputIt dest, size_type maxCount) {
    const size_t currentHead = head_.load(std::memory_order_relaxed);
    const size_t currentTail = tail_.load(std::memory_order_acquire);

    // Calculate available items
    size_t available;
    if (currentTail >= currentHead) {
      available = currentTail - currentHead;
    } else {
      available = kBufferSize - currentHead + currentTail;
    }

    if (available == 0) {
      return 0;
    }

    // Pop as many items as requested and available
    size_t count = std::min(available, maxCount);
    size_t headPos = currentHead;
    for (size_t i = 0; i < count; ++i, ++dest) {
      T* elem = elementAt(headPos);
      *dest = std::move(*elem);
      elem->~T();
      headPos = increment(headPos);
    }

    if (count > 0) {
      head_.store(headPos, std::memory_order_release);
    }
    return count;
  }

  /**
   * @brief Checks if the buffer is empty.
   *
   * @return true if the buffer contains no elements, false otherwise.
   *
   * @note This function provides only a snapshot of the buffer state.
   *       The result may be stale by the time it is used, as another
   *       thread may have pushed or popped an element.
   *
   * @note Safe to call from any thread, but the result is only a hint.
   */
  bool empty() const {
    return head_.load(std::memory_order_acquire) == tail_.load(std::memory_order_acquire);
  }

  /**
   * @brief Checks if the buffer is full.
   *
   * @return true if the buffer has no space for additional elements, false otherwise.
   *
   * @note This function provides only a snapshot of the buffer state.
   *       The result may be stale by the time it is used, as another
   *       thread may have popped an element.
   *
   * @note Safe to call from any thread, but the result is only a hint.
   */
  bool full() const {
    return increment(tail_.load(std::memory_order_acquire)) ==
        head_.load(std::memory_order_acquire);
  }

  /**
   * @brief Returns the current number of elements in the buffer.
   *
   * @return The number of elements currently in the buffer.
   *
   * @note This function provides only a snapshot of the buffer state.
   *       The result may be stale by the time it is used.
   *
   * @note Safe to call from any thread, but the result is only a hint.
   *
   * @note The implementation handles wrap-around correctly using modular
   *       arithmetic.
   */
  size_type size() const {
    const size_t head = head_.load(std::memory_order_acquire);
    const size_t tail = tail_.load(std::memory_order_acquire);
    // Handle wrap-around: if tail < head, we've wrapped
    return (tail >= head) ? (tail - head) : (kBufferSize - head + tail);
  }

  /**
   * @brief Returns the maximum number of elements the buffer can hold.
   *
   * When RoundUpToPowerOfTwo is true (default), this may be larger than the
   * requested Capacity template parameter, as the internal buffer is rounded
   * up to the next power of two for performance.
   *
   * @return The actual maximum capacity of the buffer (kBufferSize - 1).
   *
   * @note The actual storage uses capacity() + 1 slots internally to distinguish
   *       between empty and full states.
   */
  static constexpr size_type capacity() noexcept {
    return kBufferSize - 1;
  }

 private:
  /**
   * @brief Computes the internal buffer size at compile time.
   *
   * When RoundUpToPowerOfTwo is true, rounds (Capacity + 1) up to the next power of two.
   * Otherwise, uses exactly (Capacity + 1).
   */
  static constexpr size_t computeBufferSize() noexcept {
    return RoundUpToPowerOfTwo ? static_cast<size_t>(detail::nextPow2(Capacity + 1))
                               : Capacity + 1;
  }

  /**
   * @brief Internal buffer size (includes one extra slot to distinguish empty from full).
   *
   * The ring buffer uses one slot as a sentinel to distinguish between the
   * empty state (head == tail) and the full state (next(tail) == head).
   * When RoundUpToPowerOfTwo is true, this is rounded up to a power of two.
   */
  static constexpr size_t kBufferSize = computeBufferSize();

  /**
   * @brief Compile-time check if kBufferSize is a power of two.
   *
   * When true, we can use faster bitwise AND instead of modulo for wrap-around.
   */
  static constexpr bool kIsPowerOfTwo = (kBufferSize & (kBufferSize - 1)) == 0;

  /**
   * @brief Mask for power-of-two wrap-around (kBufferSize - 1).
   *
   * Only valid when kIsPowerOfTwo is true.
   */
  static constexpr size_t kMask = kBufferSize - 1;

  /**
   * @brief Increments an index with wrap-around.
   *
   * Uses bitwise AND when buffer size is a power of two (faster),
   * otherwise falls back to modulo.
   *
   * @param index The current index.
   * @return The next index, wrapping to 0 if necessary.
   */
  static constexpr size_t increment(size_t index) noexcept {
    return kIsPowerOfTwo ? ((index + 1) & kMask) : ((index + 1) % kBufferSize);
  }

  /**
   * @brief Returns a pointer to the element at the given index.
   */
  T* elementAt(size_t index) {
    return reinterpret_cast<T*>(&storage_[index * sizeof(T)]);
  }

  const T* elementAt(size_t index) const {
    return reinterpret_cast<const T*>(&storage_[index * sizeof(T)]);
  }

  /// Head index (consumer reads from here). Cache-line aligned to avoid false sharing.
  alignas(kCacheLineSize) std::atomic<size_t> head_{0};

  /// Tail index (producer writes here). Cache-line aligned to avoid false sharing.
  alignas(kCacheLineSize) std::atomic<size_t> tail_{0};

  /// Uninitialized storage for elements. Uses kBufferSize slots (one more than capacity()) to
  /// distinguish empty from full. Elements are constructed via placement new when pushed and
  /// explicitly destroyed when popped.
  alignas(T) char storage_[sizeof(T) * kBufferSize];
};

} // namespace dispenso


================================================
FILE: dispenso/task_set.cpp
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "task_set.h"

#include

namespace dispenso {
namespace detail {

// 64 depth is pretty ridiculous, but try not to step on anyone's feet.
constexpr int32_t kMaxTasksStackSize = 64;
DISPENSO_THREAD_LOCAL TaskSetBase* g_taskStack[kMaxTasksStackSize];
DISPENSO_THREAD_LOCAL int32_t g_taskStackSize = 0;

void pushThreadTaskSet(TaskSetBase* t) {
#ifndef NDEBUG
  if (g_taskStackSize < 0 || g_taskStackSize >= kMaxTasksStackSize) {
    fprintf(stderr, "TaskSet parent stack index is invalid when pushing: %d\n", g_taskStackSize);
    std::abort();
  }
#endif // NDEBUG
  g_taskStack[g_taskStackSize++] = t;
}

void popThreadTaskSet() {
#ifndef NDEBUG
  if (g_taskStackSize <= 0) {
    fprintf(stderr, "TaskSet parent stack index is invalid when popping: %d\n", g_taskStackSize);
    std::abort();
  }
#endif // NDEBUG
  --g_taskStackSize;
}

} // namespace detail

TaskSetBase* parentTaskSet() {
  using namespace detail;
#ifndef NDEBUG
  if (g_taskStackSize < 0 || g_taskStackSize >= kMaxTasksStackSize) {
    fprintf(stderr, "TaskSet parent stack index is invalid when accessing: %d\n", g_taskStackSize);
    std::abort();
  }
#endif // NDEBUG
  return g_taskStackSize ? g_taskStack[g_taskStackSize - 1] : nullptr;
}

void TaskSetBase::trySetCurrentException() {
#if defined(__cpp_exceptions)
  auto status = kUnset;
  if (guardException_.compare_exchange_strong(status, kSetting, std::memory_order_acq_rel)) {
    exception_ = std::current_exception();
    guardException_.store(kSet, std::memory_order_release);
    canceled_.store(true, std::memory_order_release);
  }
#endif // __cpp_exceptions
}

inline bool TaskSetBase::testAndResetException() {
#if defined(__cpp_exceptions)
  if (guardException_.load(std::memory_order_acquire) == kSet) {
    auto exception = std::move(exception_);
    guardException_.store(kUnset, std::memory_order_release);
    std::rethrow_exception(exception);
  }
#endif // __cpp_exceptions
  return canceled_.load(std::memory_order_acquire);
}

bool ConcurrentTaskSet::wait() {
  // Steal work until our set is unblocked. Note that this is not the
  // fastest possible way to unblock the current set, but it will alleviate
  // deadlock, and should provide decent throughput for all waiters.
  // The deadlock scenario mentioned goes as follows: N threads in the
  // ThreadPool. Each thread is running code that is using TaskSets. No
  // progress could be made without stealing.
  while (outstandingTaskCount_.load(std::memory_order_acquire)) {
    if (!pool_.tryExecuteNext()) {
      std::this_thread::yield();
    }
  }
  return testAndResetException();
}

bool ConcurrentTaskSet::tryWait(size_t maxToExecute) {
  while (outstandingTaskCount_.load(std::memory_order_acquire) && maxToExecute--) {
    if (!pool_.tryExecuteNext()) {
      break;
    }
  }

  // Must check completion prior to checking exceptions, otherwise there could be a case where
  // exceptions are checked, then an exception is propagated, and then we return whether all items
  // have been completed, thus dropping the exception.
  if (outstandingTaskCount_.load(std::memory_order_acquire)) {
    return false;
  }
  return !testAndResetException();
}

moodycamel::ProducerToken TaskSet::makeToken(moodycamel::ConcurrentQueue& pool) {
  return moodycamel::ProducerToken(pool);
}

bool TaskSet::wait() {
  // Steal work until our set is unblocked.
  // The deadlock scenario mentioned goes as follows: N threads in the
  // ThreadPool. Each thread is running code that is using TaskSets. No
  // progress could be made without stealing.
  while (pool_.tryExecuteNextFromProducerToken(token_)) {
  }

  while (outstandingTaskCount_.load(std::memory_order_acquire)) {
    if (!pool_.tryExecuteNext()) {
      std::this_thread::yield();
    }
  }
  return testAndResetException();
}

bool TaskSet::tryWait(size_t maxToExecute) {
  ssize_t maxToExe = static_cast<ssize_t>(maxToExecute);
  while (outstandingTaskCount_.load(std::memory_order_acquire) && maxToExe--) {
    if (!pool_.tryExecuteNextFromProducerToken(token_)) {
      break;
    }
  }

  // Must check completion prior to checking exceptions, otherwise there could be a case where
  // exceptions are checked, then an exception is propagated, and then we return whether all items
  // have been completed, thus dropping the exception.
  maxToExe = std::max<ssize_t>(0, maxToExe);
  while (outstandingTaskCount_.load(std::memory_order_acquire) && maxToExe--) {
    if (!pool_.tryExecuteNext()) {
      std::this_thread::yield();
    }
  }

  if (outstandingTaskCount_.load(std::memory_order_acquire)) {
    return false;
  }
  return !testAndResetException();
}

} // namespace dispenso


================================================
FILE: dispenso/task_set.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file task_set.h * @ingroup group_core * A file providing TaskSet and ConcurrentTaskSet. These interfaces allow the user to * submit/schedule multiple closures and then wait on them. **/ #pragma once namespace dispenso { enum class ParentCascadeCancel { kOff, kOn }; } #include namespace dispenso { constexpr ssize_t kDefaultStealingMultiplier = 4; /** * TaskSet is an object that allows scheduling multiple functors to a thread pool, and * allows to wait on that set of tasks. TaskSet supplies more efficient schedule/wait * than ConcurrentTaskSet, but at the expense of only being usable from one thread at a * time. * * TaskSet is "thread-compatible". This means that you can safely use * different TaskSet objects on different threads concurrently. Any given * TaskSet object may only be used from a single thread, so no concurrent use of that * object is allowed. **/ class TaskSet : public TaskSetBase { public: /** * Construct a TaskSet with the given backing pool. * * @param p The backing pool for this TaskSet * @param registerForParentCancel Whether to register for parent cancellation cascade. * @param stealingLoadMultiplier An over-load factor. If this factor of load is reached by the * underlying pool, scheduled tasks may run immediately in the calling thread. **/ TaskSet( ThreadPool& p, ParentCascadeCancel registerForParentCancel, ssize_t stealingLoadMultiplier = kDefaultStealingMultiplier) : TaskSetBase(p, registerForParentCancel, stealingLoadMultiplier), token_(makeToken(p.work_)) {} /** Construct a TaskSet with default options. @param p The backing pool. */ TaskSet(ThreadPool& p) : TaskSet(p, ParentCascadeCancel::kOff, kDefaultStealingMultiplier) {} /** Construct a TaskSet with custom load multiplier. @param p The backing pool. @param * stealingLoadMultiplier The over-load factor. */ TaskSet(ThreadPool& p, ssize_t stealingLoadMultiplier) : TaskSet(p, ParentCascadeCancel::kOff, stealingLoadMultiplier) {} TaskSet(TaskSet&& other) = delete; TaskSet& operator=(TaskSet&& other) = delete; /** * Schedule a functor for execution on the underlying pool. If the load on the * underlying pool is high, immediate inline execution may occur on the current thread. * * @param f A functor matching signature void(). Best performance will come from * passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. * * @note If f can throw exceptions, then schedule may throw if the task * is run inline. Otherwise, exceptions will be caught on the running thread and best-effort * propagated to the ConcurrentTaskSet, where the first one from the set is rethrown * in wait. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f) { if (DISPENSO_EXPECT(canceled(), false)) { return; } if (outstandingTaskCount_.load(std::memory_order_relaxed) > taskSetLoadFactor_) { f(); } else { pool_.schedule(token_, packageTask(std::forward(f))); } } /** * Schedule a functor for execution on the underlying pool. * * @param f A functor matching signature void(). Best performance will come from * passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. * @param fq Tag to force queuing instead of potential inline execution. 
* * @note If f can throw exceptions, then exceptions will be caught on the running * thread and best-effort propagated to the ConcurrentTaskSet, where the first one * from the set is rethrown in wait. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f, ForceQueuingTag fq) { pool_.schedule(token_, packageTask(std::forward(f)), fq); } /** * Schedule multiple functors for execution on the underlying pool in bulk. * This is more efficient than calling schedule() multiple times when you have many tasks to * submit, as it reduces atomic contention and allows for better thread wakeup behavior. * * @param count The number of functors to schedule. * @param gen A generator functor that takes an index and returns a functor to execute. * gen(i) will be called for i in [0, count) to produce each task. * * @note Work is processed in chunks, interleaving enqueue and inline execution based on the * task set's load factor, preventing both pool thread starvation and excessive overhead. **/ template void scheduleBulk(size_t count, Generator&& gen) { scheduleBulkImpl(count, std::forward(gen), &token_); } /** * Wait for all currently scheduled functors to finish execution. If exceptions are thrown * during execution of the set of tasks, wait will propagate the first exception. * * @return true if the TaskSet was canceled, false otherwise **/ DISPENSO_DLL_ACCESS bool wait(); /** * See if the currently scheduled functors can be completed while stealing and executing at most * maxToExecute of them from the pool. If not used in conjunction with wait, there * may be cases that tryWait must be called multiple times with * maxToExecute > 0 to prevent livelock/deadlock. If exceptions have been * propagated since the last call to wait or tryWait, * tryWait will propagate the first of them. * * @param maxToExecute The maximum number of tasks to proactively execute on the current thread. * * @return true if all currently scheduled functors have been completed prior to * returning, and false otherwise. This includes returning false if the TaskSet was * cancelled. **/ DISPENSO_DLL_ACCESS bool tryWait(size_t maxToExecute); /** * Set the TaskSet to canceled state. No unexecuted tasks will execute once this is set. * Already executing tasks may check canceled() status to exit early. * **/ void cancel() { TaskSetBase::cancel(); } /** * Check the canceled status of the TaskSet. * * @return a boolean indicating whether or not the TaskSet has been canceled. **/ bool canceled() const { return TaskSetBase::canceled(); } /** * Destroy the TaskSet, first waiting for all currently scheduled functors to * finish execution. **/ ~TaskSet() { wait(); } private: DISPENSO_DLL_ACCESS moodycamel::ProducerToken makeToken( moodycamel::ConcurrentQueue& pool); moodycamel::ProducerToken token_; template friend class detail::FutureBase; }; /** * ConcurrentTaskSet fulfills the same API as TaskSet with one minor * difference: It may be used to schedule tasks concurrently from multiple threads (see more below). * It is an object that allows scheduling multiple function-like objects to a thread pool, and * allows to wait on that set of tasks. * * ConcurrentTaskSet is "thread-compatible". This means that you can safely use * different ConcurrentTaskSet objects on different threads concurrently. * ConcurrentTaskSet also allows multiple threads to concurrently schedule against it. * It is an error to call wait() concurrently with schedule() on the same * ConcurrentTaskSet. 
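 *
 * As a minimal usage sketch (illustrative only, not taken from the original
 * documentation; it assumes the pool returned by dispenso::globalThreadPool()):
 *
 * @code
 *   dispenso::ConcurrentTaskSet tasks(dispenso::globalThreadPool());
 *   std::atomic<int> sum{0};
 *   for (int i = 0; i < 8; ++i) {
 *     tasks.schedule([i, &sum]() { sum.fetch_add(i, std::memory_order_relaxed); });
 *   }
 *   tasks.wait(); // all scheduled tasks have finished (or the first exception is rethrown)
 * @endcode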
*/ class ConcurrentTaskSet : public TaskSetBase { public: /** * Construct a ConcurrentTaskSet with the given backing pool. * * @param pool The backing pool for this ConcurrentTaskSet * @param registerForParentCancel Whether to register for parent cancellation cascade. * @param stealingLoadMultiplier An over-load factor. If this factor of load is reached by the * underlying pool, scheduled tasks may run immediately in the calling thread. **/ ConcurrentTaskSet( ThreadPool& pool, ParentCascadeCancel registerForParentCancel, ssize_t stealingLoadMultiplier = kDefaultStealingMultiplier) : TaskSetBase(pool, registerForParentCancel, stealingLoadMultiplier) {} /** Construct a ConcurrentTaskSet with default options. @param p The backing pool. */ ConcurrentTaskSet(ThreadPool& p) : ConcurrentTaskSet(p, ParentCascadeCancel::kOff, kDefaultStealingMultiplier) {} /** Construct a ConcurrentTaskSet with custom load multiplier. @param p The backing pool. @param * stealingLoadMultiplier The over-load factor. */ ConcurrentTaskSet(ThreadPool& p, ssize_t stealingLoadMultiplier) : ConcurrentTaskSet(p, ParentCascadeCancel::kOff, stealingLoadMultiplier) {} ConcurrentTaskSet(ConcurrentTaskSet&& other) = delete; ConcurrentTaskSet& operator=(ConcurrentTaskSet&& other) = delete; /** * Schedule a functor for execution on the underlying pool. If the load on the * underlying pool is high, immediate inline execution may occur on the current thread. * * @param f A functor matching signature void(). Best performance will come from * passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. * * @param skipRecheck A poweruser knob that says that if we don't have enough outstanding tasks to * immediately work steal, we should bypass the similar check in the ThreadPool. * * @note If f can throw exceptions, then schedule may throw if the task * is run inline. Otherwise, exceptions will be caught on the running thread and best-effort * propagated to the ConcurrentTaskSet, where the first one from the set is rethrown * in wait. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f, bool skipRecheck = false) { if (outstandingTaskCount_.load(std::memory_order_relaxed) > taskSetLoadFactor_ && DISPENSO_EXPECT(!canceled(), true)) { f(); } else if (skipRecheck) { pool_.schedule(packageTask(std::forward(f)), ForceQueuingTag()); } else { pool_.schedule(packageTask(std::forward(f))); } } /** * Schedule a functor for execution on the underlying pool. * * @param f A functor matching signature void(). Best performance will come from * passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. * @param fq Tag to force queuing instead of potential inline execution. * * @note If f can throw exceptions, then exceptions will be caught on the running * thread and best-effort propagated to the ConcurrentTaskSet, where the first one * from the set is rethrown in wait. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f, ForceQueuingTag fq) { pool_.schedule(packageTask(std::forward(f)), fq); } /** * Schedule multiple functors for execution on the underlying pool in bulk. * This is more efficient than calling schedule() multiple times when you have many tasks to * submit, as it reduces atomic contention and allows for better thread wakeup behavior. * * @param count The number of functors to schedule. * @param gen A generator functor that takes an index and returns a functor to execute. 
* gen(i) will be called for i in [0, count) to produce each task. * * @note Work is processed in chunks, interleaving enqueue and inline execution based on the * task set's load factor, preventing both pool thread starvation and excessive overhead. **/ template void scheduleBulk(size_t count, Generator&& gen) { scheduleBulkImpl(count, std::forward(gen), nullptr); } /** * Wait for all currently scheduled functors to finish execution. If exceptions are thrown * during execution of the set of tasks, wait will propagate the first exception. **/ DISPENSO_DLL_ACCESS bool wait(); /** * See if the currently scheduled functors can be completed while stealing and executing at most * maxToExecute of them from the pool. If not used in conjunction with wait, there * may be cases that tryWait must be called multiple times with * maxToExecute > 0 to prevent livelock/deadlock. If exceptions have been * propagated since the last call to wait or tryWait, * tryWait will propagate the first of them. * * @param maxToExecute The maximum number of tasks to proactively execute on the current thread. * * @return true if all currently scheduled functors have been completed prior to * returning, and false otherwise (including cancelled cases). **/ DISPENSO_DLL_ACCESS bool tryWait(size_t maxToExecute); /** * Set the ConcurrentTaskSet to canceled state. No unexecuted tasks will execute once this is * set. Already executing tasks may check canceled() status to exit early. * * @note This will be reset automatically by wait. **/ void cancel() { TaskSetBase::cancel(); } /** * Check the canceled status of the ConcurrentTaskSet. * * @return a boolean indicating whether or not the ConcurrentTaskSet has been canceled. **/ bool canceled() const { return TaskSetBase::canceled(); } /** * Destroy the ConcurrentTaskSet, first waiting for all currently scheduled functors to * finish execution. **/ ~ConcurrentTaskSet() { wait(); } private: bool tryExecuteNext() { return pool_.tryExecuteNext(); } template friend class detail::FutureBase; friend class detail::LimitGatedScheduler; }; /** * Get access to the parent task set that scheduled the currently running code. nullptr if called * outside the context of a (Concurrent)TaskSet schedule. * **/ DISPENSO_DLL_ACCESS TaskSetBase* parentTaskSet(); } // namespace dispenso ================================================ FILE: dispenso/third-party/moodycamel/LICENSE.md ================================================ This license file applies to everything in this repository except that which is explicitly annotated as being written by other authors, i.e. the Boost queue (included in the benchmarks for comparison), Intel's TBB library (ditto), dlib::pipe (ditto), the CDSChecker tool (used for verification), the Relacy model checker (ditto), and Jeff Preshing's semaphore implementation (used in the blocking queue) which has a zlib license (embedded in lightweightsempahore.h). --- Simplified BSD License: Copyright (c) 2013-2016, Cameron Desrochers. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --- I have also chosen to dual-license under the Boost Software License as an alternative to the Simplified BSD license above: Boost Software License - Version 1.0 - August 17th, 2003 Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: dispenso/third-party/moodycamel/README.txt ================================================ https://github.com/cameron314/concurrentqueue commit 65d6970912fc3f6bb62d80edf95ca30e0df85137 (HEAD -> master, origin/master, origin/HEAD) Merge: d49fa2b 08dcafc Author: Cameron Date: Sun Jul 24 10:02:12 2022 -0400 Merge pull request #308 from r8bhavneet/master Update README.md commit 08dcafcd131b46e1a63abdc9b5f73c852193edca Author: r8bhavneet <98200254+r8bhavneet@users.noreply.github.com> Date: Sun Jul 24 02:35:57 2022 -0700 Update README.md Hey, I really liked the project and was reading through the Readme.md file when I came across some redundant words and phrases which you might have missed whil e editing the documentation. It would be really a great opportunity for me if I could contribute to this project. Thank you. 
commit d49fa2b0bd1c6185d93509f48c8987f9759d7238 Merge: 0a40449 9dc1b2c Author: Cameron Date: Mon May 9 07:43:29 2022 -0400 Merge pull request #296 from MathiasMagnus/fix-c4554 Proper MSVC warning fix and note commit 9dc1b2cfcad03b4ee22ea57ddb5c453c41c19ac9 Author: Máté Ferenc Nagy-Egri Date: Mon May 9 13:19:39 2022 +0200 Proper MSVC warning fix and note commit 0a404492ac2c0bba0f62eb2b859ec152e494f8bf Author: Cameron Date: Sat May 7 12:04:00 2022 -0400 Attempt to resolve -Wsign-conversion warnings in concurrentqueue.h (see #294) commit 22c78daf65d2c8cce9399a29171676054aa98807 Merge: c52e5ef 263c55d Author: Cameron Date: Sun Mar 20 15:16:30 2022 -0400 Merge pull request #290 from usurai/master Fix link in README commit 263c55d5c95545abee1ef25662c752c5296d7c34 Author: usurai Date: Thu Mar 17 16:09:14 2022 +0800 Fix link in README ================================================ FILE: dispenso/third-party/moodycamel/blockingconcurrentqueue.h ================================================ // Provides an efficient blocking version of moodycamel::ConcurrentQueue. // ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified // BSD license, available at the top of concurrentqueue.h. // Also dual-licensed under the Boost Software License (see LICENSE.md) // Uses Jeff Preshing's semaphore implementation (under the terms of its // separate zlib license, see lightweightsemaphore.h). #pragma once #include "concurrentqueue.h" #include "lightweightsemaphore.h" #include #include #include #include #include namespace moodycamel { // This is a blocking version of the queue. It has an almost identical interface to // the normal non-blocking version, with the addition of various wait_dequeue() methods // and the removal of producer-specific dequeue methods. template class BlockingConcurrentQueue { private: typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; typedef ::moodycamel::LightweightSemaphore LightweightSemaphore; public: typedef typename ConcurrentQueue::producer_token_t producer_token_t; typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; typedef typename ConcurrentQueue::index_t index_t; typedef typename ConcurrentQueue::size_t size_t; typedef typename std::make_signed::type ssize_t; static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; public: // Creates a queue with at least `capacity` element slots; note that the // actual number of elements that can be inserted without additional memory // allocation depends on the number of producers and the block size (e.g. if // the block size is equal to `capacity`, only a single block will be allocated // up-front, which means only a single producer will be able to enqueue elements // without an extra allocation -- blocks aren't shared between producers). 
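	// (Illustrative sketch, not part of the upstream comment: default construction and
	// basic blocking use, assuming a queue of int.
	//     moodycamel::BlockingConcurrentQueue<int> q;  // capacity defaults to 6 * BLOCK_SIZE
	//     q.enqueue(42);
	//     int v;
	//     q.wait_dequeue(v);  // blocks until an item is available
	// )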
// This method is not thread safe -- it is up to the user to ensure that the // queue is fully constructed before it starts being used by other threads (this // includes making the memory effects of construction visible, possibly with a // memory barrier). explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) : inner(capacity), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) { assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); if (!sema) { MOODYCAMEL_THROW(std::bad_alloc()); } } BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) { assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); if (!sema) { MOODYCAMEL_THROW(std::bad_alloc()); } } // Disable copying and copy assignment BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; // Moving is supported, but note that it is *not* a thread-safe operation. // Nobody can use the queue while it's being moved, and the memory effects // of that move must be propagated to other threads before they can use it. // Note: When a queue is moved, its tokens are still valid but can only be // used with the destination queue (i.e. semantically they are moved along // with the queue itself). BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT : inner(std::move(other.inner)), sema(std::move(other.sema)) { } inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return swap_internal(other); } // Swaps this queue's state with the other's. Not thread-safe. // Swapping two queues does not invalidate their tokens, however // the tokens that were created for one queue must be used with // only the swapped queue (i.e. the tokens are tied to the // queue's movable state, not the object itself). inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } private: BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) { if (this == &other) { return *this; } inner.swap(other.inner); sema.swap(other.sema); return *this; } public: // Enqueues a single item (by copying it). // Allocates memory if required. Only fails if memory allocation fails (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(T const& item) { if ((details::likely)(inner.enqueue(item))) { sema->signal(); return true; } return false; } // Enqueues a single item (by moving it, if possible). // Allocates memory if required. Only fails if memory allocation fails (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. 
inline bool enqueue(T&& item) { if ((details::likely)(inner.enqueue(std::move(item)))) { sema->signal(); return true; } return false; } // Enqueues a single item (by copying it) using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails (or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(producer_token_t const& token, T const& item) { if ((details::likely)(inner.enqueue(token, item))) { sema->signal(); return true; } return false; } // Enqueues a single item (by moving it, if possible) using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails (or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(producer_token_t const& token, T&& item) { if ((details::likely)(inner.enqueue(token, std::move(item)))) { sema->signal(); return true; } return false; } // Enqueues several items. // Allocates memory if required. Only fails if memory allocation fails (or // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Note: Use std::make_move_iterator if the elements should be moved instead of copied. // Thread-safe. template inline bool enqueue_bulk(It itemFirst, size_t count) { if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); return true; } return false; } // Enqueues several items using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); return true; } return false; } // Enqueues a single item (by copying it). // Does not allocate memory. Fails if not enough room to enqueue (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE // is 0). // Thread-safe. inline bool try_enqueue(T const& item) { if (inner.try_enqueue(item)) { sema->signal(); return true; } return false; } // Enqueues a single item (by moving it, if possible). // Does not allocate memory (except for one-time implicit producer). // Fails if not enough room to enqueue (or implicit production is // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). // Thread-safe. inline bool try_enqueue(T&& item) { if (inner.try_enqueue(std::move(item))) { sema->signal(); return true; } return false; } // Enqueues a single item (by copying it) using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Thread-safe. inline bool try_enqueue(producer_token_t const& token, T const& item) { if (inner.try_enqueue(token, item)) { sema->signal(); return true; } return false; } // Enqueues a single item (by moving it, if possible) using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Thread-safe. 
inline bool try_enqueue(producer_token_t const& token, T&& item) { if (inner.try_enqueue(token, std::move(item))) { sema->signal(); return true; } return false; } // Enqueues several items. // Does not allocate memory (except for one-time implicit producer). // Fails if not enough room to enqueue (or implicit production is // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template inline bool try_enqueue_bulk(It itemFirst, size_t count) { if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); return true; } return false; } // Enqueues several items using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); return true; } return false; } // Attempts to dequeue from the queue. // Returns false if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template inline bool try_dequeue(U& item) { if (sema->tryWait()) { while (!inner.try_dequeue(item)) { continue; } return true; } return false; } // Attempts to dequeue from the queue using an explicit consumer token. // Returns false if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template inline bool try_dequeue(consumer_token_t& token, U& item) { if (sema->tryWait()) { while (!inner.try_dequeue(token, item)) { continue; } return true; } return false; } // Attempts to dequeue several elements from the queue. // Returns the number of items actually dequeued. // Returns 0 if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template inline size_t try_dequeue_bulk(It itemFirst, size_t max) { size_t count = 0; max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); while (count != max) { count += inner.template try_dequeue_bulk(itemFirst, max - count); } return count; } // Attempts to dequeue several elements from the queue using an explicit consumer token. // Returns the number of items actually dequeued. // Returns 0 if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) { size_t count = 0; max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); while (count != max) { count += inner.template try_dequeue_bulk(token, itemFirst, max - count); } return count; } // Blocks the current thread until there's something to dequeue, then // dequeues it. // Never allocates. Thread-safe. 
template inline void wait_dequeue(U& item) { while (!sema->wait()) { continue; } while (!inner.try_dequeue(item)) { continue; } } // Blocks the current thread until either there's something to dequeue // or the timeout (specified in microseconds) expires. Returns false // without setting `item` if the timeout expires, otherwise assigns // to `item` and returns true. // Using a negative timeout indicates an indefinite timeout, // and is thus functionally equivalent to calling wait_dequeue. // Never allocates. Thread-safe. template inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) { if (!sema->wait(timeout_usecs)) { return false; } while (!inner.try_dequeue(item)) { continue; } return true; } // Blocks the current thread until either there's something to dequeue // or the timeout expires. Returns false without setting `item` if the // timeout expires, otherwise assigns to `item` and returns true. // Never allocates. Thread-safe. template inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) { return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); } // Blocks the current thread until there's something to dequeue, then // dequeues it using an explicit consumer token. // Never allocates. Thread-safe. template inline void wait_dequeue(consumer_token_t& token, U& item) { while (!sema->wait()) { continue; } while (!inner.try_dequeue(token, item)) { continue; } } // Blocks the current thread until either there's something to dequeue // or the timeout (specified in microseconds) expires. Returns false // without setting `item` if the timeout expires, otherwise assigns // to `item` and returns true. // Using a negative timeout indicates an indefinite timeout, // and is thus functionally equivalent to calling wait_dequeue. // Never allocates. Thread-safe. template inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) { if (!sema->wait(timeout_usecs)) { return false; } while (!inner.try_dequeue(token, item)) { continue; } return true; } // Blocks the current thread until either there's something to dequeue // or the timeout expires. Returns false without setting `item` if the // timeout expires, otherwise assigns to `item` and returns true. // Never allocates. Thread-safe. template inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) { return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); } // Attempts to dequeue several elements from the queue. // Returns the number of items actually dequeued, which will // always be at least one (this method blocks until the queue // is non-empty) and at most max. // Never allocates. Thread-safe. template inline size_t wait_dequeue_bulk(It itemFirst, size_t max) { size_t count = 0; max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); while (count != max) { count += inner.template try_dequeue_bulk(itemFirst, max - count); } return count; } // Attempts to dequeue several elements from the queue. // Returns the number of items actually dequeued, which can // be 0 if the timeout expires while waiting for elements, // and at most max. // Using a negative timeout indicates an indefinite timeout, // and is thus functionally equivalent to calling wait_dequeue_bulk. // Never allocates. Thread-safe. 
template inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) { size_t count = 0; max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); while (count != max) { count += inner.template try_dequeue_bulk(itemFirst, max - count); } return count; } // Attempts to dequeue several elements from the queue. // Returns the number of items actually dequeued, which can // be 0 if the timeout expires while waiting for elements, // and at most max. // Never allocates. Thread-safe. template inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) { return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); } // Attempts to dequeue several elements from the queue using an explicit consumer token. // Returns the number of items actually dequeued, which will // always be at least one (this method blocks until the queue // is non-empty) and at most max. // Never allocates. Thread-safe. template inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) { size_t count = 0; max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); while (count != max) { count += inner.template try_dequeue_bulk(token, itemFirst, max - count); } return count; } // Attempts to dequeue several elements from the queue using an explicit consumer token. // Returns the number of items actually dequeued, which can // be 0 if the timeout expires while waiting for elements, // and at most max. // Using a negative timeout indicates an indefinite timeout, // and is thus functionally equivalent to calling wait_dequeue_bulk. // Never allocates. Thread-safe. template inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) { size_t count = 0; max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); while (count != max) { count += inner.template try_dequeue_bulk(token, itemFirst, max - count); } return count; } // Attempts to dequeue several elements from the queue using an explicit consumer token. // Returns the number of items actually dequeued, which can // be 0 if the timeout expires while waiting for elements, // and at most max. // Never allocates. Thread-safe. template inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) { return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); } // Returns an estimate of the total number of elements currently in the queue. This // estimate is only accurate if the queue has completely stabilized before it is called // (i.e. all enqueue and dequeue operations have completed and their memory effects are // visible on the calling thread, and no further operations start while this method is // being called). // Thread-safe. inline size_t size_approx() const { return (size_t)sema->availableApprox(); } // Returns true if the underlying atomic variables used by // the queue are lock-free (they should be on most platforms). // Thread-safe. static constexpr bool is_lock_free() { return ConcurrentQueue::is_lock_free(); } private: template static inline U* create(A1&& a1, A2&& a2) { void* p = (Traits::malloc)(sizeof(U)); return p != nullptr ? 
new (p) U(std::forward(a1), std::forward(a2)) : nullptr; } template static inline void destroy(U* p) { if (p != nullptr) { p->~U(); } (Traits::free)(p); } private: ConcurrentQueue inner; std::unique_ptr sema; }; template inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT { a.swap(b); } } // end namespace moodycamel ================================================ FILE: dispenso/third-party/moodycamel/concurrentqueue.h ================================================ // Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. // An overview, including benchmark results, is provided here: // http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ // The full design is also described in excruciating detail at: // http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue // Simplified BSD license: // Copyright (c) 2013-2020, Cameron Desrochers. // All rights reserved. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // - Redistributions of source code must retain the above copyright notice, this list of // conditions and the following disclaimer. // - Redistributions in binary form must reproduce the above copyright notice, this list of // conditions and the following disclaimer in the documentation and/or other materials // provided with the distribution. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL // THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT // OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR // TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, // EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Also dual-licensed under the Boost Software License (see LICENSE.md) #pragma once #if defined(__GNUC__) && !defined(__INTEL_COMPILER) // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and // Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings // upon assigning any computed values) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #if defined(__clang__) #pragma GCC diagnostic ignored "-Wglobal-constructors" #endif //__clang__ #ifdef MCDBGQ_USE_RELACY #pragma GCC diagnostic ignored "-Wint-to-pointer-cast" #endif #endif #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) // VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher // does not support `if constexpr`, so we have no choice but to simply disable the warning #pragma warning(push) #pragma warning(disable: 4127) // conditional expression is constant #endif #if defined(__APPLE__) #include "TargetConditionals.h" #endif #ifdef MCDBGQ_USE_RELACY #include "relacy/relacy_std.hpp" #include "relacy_shims.h" // We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. // We'll override the default trait malloc ourselves without a macro. 
#undef new #undef delete #undef malloc #undef free #else #include // Requires C++11. Sorry VS2010. #include #endif #include // for max_align_t #include #include #include #include #include #include #include // for CHAR_BIT #include #include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include // used for thread exit synchronization // Platform-specific definitions of a numeric thread ID type and an invalid value namespace moodycamel { namespace details { template struct thread_id_converter { typedef thread_id_t thread_id_numeric_size_t; typedef thread_id_t thread_id_hash_t; static thread_id_hash_t prehash(thread_id_t const& x) { return x; } }; } } #if defined(MCDBGQ_USE_RELACY) namespace moodycamel { namespace details { typedef std::uint32_t thread_id_t; static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; static inline thread_id_t thread_id() { return rl::thread_index(); } } } #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the function // we use and rely on backwards-compatibility for this not to break extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); namespace moodycamel { namespace details { static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); typedef std::uint32_t thread_id_t; static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } } } #elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(MOODYCAMEL_NO_THREAD_LOCAL) namespace moodycamel { namespace details { static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); typedef std::thread::id thread_id_t; static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't // be. 
static inline thread_id_t thread_id() { return std::this_thread::get_id(); } template struct thread_id_size { }; template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; template<> struct thread_id_converter { typedef thread_id_size::numeric_t thread_id_numeric_size_t; #ifndef __APPLE__ typedef std::size_t thread_id_hash_t; #else typedef thread_id_numeric_size_t thread_id_hash_t; #endif static thread_id_hash_t prehash(thread_id_t const& x) { #ifndef __APPLE__ return std::hash()(x); #else return *reinterpret_cast(&x); #endif } }; } } #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a thread-local // static variable's address as a thread identifier :-) #if defined(__GNUC__) || defined(__INTEL_COMPILER) #define MOODYCAMEL_THREADLOCAL __thread #elif defined(_MSC_VER) #define MOODYCAMEL_THREADLOCAL __declspec(thread) #else // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif namespace moodycamel { namespace details { typedef std::uintptr_t thread_id_t; static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } } } #endif // Constexpr if #ifndef MOODYCAMEL_CONSTEXPR_IF #if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L #define MOODYCAMEL_CONSTEXPR_IF if constexpr #define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] #else #define MOODYCAMEL_CONSTEXPR_IF if #define MOODYCAMEL_MAYBE_UNUSED #endif #endif // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED #if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED #define MOODYCAMEL_TRY try #define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) #define MOODYCAMEL_RETHROW throw #define MOODYCAMEL_THROW(expr) throw (expr) #else #define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) #define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif #ifndef MOODYCAMEL_NOEXCEPT #if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) #define MOODYCAMEL_NOEXCEPT #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( // We have to assume *all* non-trivial constructors may throw on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else #define MOODYCAMEL_NOEXCEPT noexcept #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) #endif #endif #ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else // VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 // g++ <=4.7 doesn't support thread_local either. // Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work #if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) // Assume `thread_local` is fully supported in all other C++11 compilers/platforms #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on #endif #endif #endif // VS2012 doesn't support deleted functions. // In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
#ifndef MOODYCAMEL_DELETE_FUNCTION
#if defined(_MSC_VER) && _MSC_VER < 1800
#define MOODYCAMEL_DELETE_FUNCTION
#else
#define MOODYCAMEL_DELETE_FUNCTION = delete
#endif
#endif

namespace moodycamel { namespace details {
#ifndef MOODYCAMEL_ALIGNAS
// VS2013 doesn't support alignas or alignof, and align() requires a constant literal
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
    template<int Align, typename T> struct Vs2013Aligned { };  // default, unsupported alignment
    template<typename T> struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; };
    template<typename T> struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; };
    template<typename T> struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; };
    template<typename T> struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; };
    template<typename T> struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; };
    template<typename T> struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; };
    template<typename T> struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; };
    template<typename T> struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; };
    template<typename T> struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; };
#else
    template<typename T> struct identity { typedef T type; };
#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
#define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity<T>::type
#endif
#endif
} }

// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one,
// we can apply per-function compile-time suppression.
// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
#define MOODYCAMEL_NO_TSAN
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#undef MOODYCAMEL_NO_TSAN
#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
#endif // TSAN
#endif // TSAN

// Compiler-specific likely/unlikely hints
namespace moodycamel { namespace details {
#if defined(__GNUC__)
    static inline bool (likely)(bool x) { return __builtin_expect((x), true); }
    static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); }
#else
    static inline bool (likely)(bool x) { return x; }
    static inline bool (unlikely)(bool x) { return x; }
#endif
} }

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
#include "internal/concurrentqueue_internal_debug.h"
#endif

namespace moodycamel {
namespace details {
    template<typename T>
    struct const_numeric_max {
        static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
        static const T value = std::numeric_limits<T>::is_signed
            ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)
            : static_cast<T>(-1);
    };

#if defined(__GLIBCXX__)
    typedef ::max_align_t std_max_align_t;      // libstdc++ forgot to add it to std:: for a while
#else
    typedef std::max_align_t std_max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
#endif

    // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting
    // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64.
    typedef union {
        std_max_align_t x;
        long long y;
        void* z;
    } max_align_t;
}

// Default traits for the ConcurrentQueue. To change some of the
// traits without re-implementing all of them, inherit from this
// struct and shadow the declarations you wish to be different;
// since the traits are used as a template type parameter, the
// shadowed declarations will be used where defined, and the defaults
// otherwise.
struct ConcurrentQueueDefaultTraits
{
    // General-purpose size type. std::size_t is strongly recommended.
    typedef std::size_t size_t;

    // The type used for the enqueue and dequeue indices. Must be at least as
    // large as size_t. Should be significantly larger than the number of elements
    // you expect to hold at once, especially if you have a high turnover rate;
    // for example, on 32-bit x86, if you expect to have over a hundred million
    // elements or pump several million elements through your queue in a very
    // short space of time, using a 32-bit type *may* trigger a race condition.
    // A 64-bit int type is recommended in that case, and in practice will
    // prevent a race condition no matter the usage of the queue. Note that
    // whether the queue is lock-free with a 64-int type depends on the whether
    // std::atomic<std::uint64_t> is lock-free, which is platform-specific.
    typedef std::size_t index_t;

    // Internally, all elements are enqueued and dequeued from multi-element
    // blocks; this is the smallest controllable unit. If you expect few elements
    // but many producers, a smaller block size should be favoured. For few producers
    // and/or many elements, a larger block size is preferred. A sane default
    // is provided. Must be a power of 2.
    static const size_t BLOCK_SIZE = 32;

    // For explicit producers (i.e. when using a producer token), the block is
    // checked for being empty by iterating through a list of flags, one per element.
    // For large block sizes, this is too inefficient, and switching to an atomic
    // counter-based approach is faster. The switch is made for block sizes strictly
    // larger than this threshold.
    static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;

    // How many full blocks can be expected for a single explicit producer? This should
    // reflect that number's maximum for optimal performance. Must be a power of 2.
    static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;

    // How many full blocks can be expected for a single implicit producer? This should
    // reflect that number's maximum for optimal performance. Must be a power of 2.
    static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;

    // The initial size of the hash table mapping thread IDs to implicit producers.
    // Note that the hash is resized every time it becomes half full.
    // Must be a power of two, and either 0 or at least 1. If 0, implicit production
    // (using the enqueue methods without an explicit producer token) is disabled.
    static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;

    // Controls the number of items that an explicit consumer (i.e. one with a token)
    // must consume before it causes all consumers to rotate and move on to the next
    // internal queue.
    static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;

    // The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
    // Enqueue operations that would cause this limit to be surpassed will fail. Note
    // that this limit is enforced at the block level (for performance reasons), i.e.
    // it's rounded up to the nearest block size.
    static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;

    // The number of times to spin before sleeping when waiting on a semaphore.
    // Recommended values are on the order of 1000-10000 unless the number of
    // consumer threads exceeds the number of idle cores (in which case try 0-100).
    // Only affects instances of the BlockingConcurrentQueue.
    static const int MAX_SEMA_SPINS = 10000;

    // Whether to recycle dynamically-allocated blocks into an internal free list or
    // not. If false, only pre-allocated blocks (controlled by the constructor
    // arguments) will be recycled, and all others will be `free`d back to the heap.
    // Note that blocks consumed by explicit producers are only freed on destruction
    // of the queue (not following destruction of the token) regardless of this trait.
    static const bool RECYCLE_ALLOCATED_BLOCKS = false;

#ifndef MCDBGQ_USE_RELACY
    // Memory allocation can be customized if needed.
    // malloc should return nullptr on failure, and handle alignment like std::malloc.
#if defined(malloc) || defined(free)
    // Gah, this is 2015, stop defining macros that break standard code already!
    // Work around malloc/free being special macros:
    static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
    static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
    static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }
    static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
#else
    static inline void* malloc(size_t size) { return std::malloc(size); }
    static inline void free(void* ptr) { return std::free(ptr); }
#endif
#else
    // Debug versions when running under the Relacy race detector (ignore
    // these in user code)
    static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
    static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
#endif
};

// When producing or consuming many elements, the most efficient way is to:
//    1) Use one of the bulk-operation methods of the queue with a token
//    2) Failing that, use the bulk-operation methods without a token
//    3) Failing that, create a token and use that with the single-item methods
//    4) Failing that, use the single-parameter methods of the queue
// Having said that, don't create tokens willy-nilly -- ideally there should be
// a maximum of one token per thread (of each kind).
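// ---------------------------------------------------------------------------
// [Editorial note -- not part of the upstream moodycamel header.]
// A minimal, hedged sketch of the traits mechanism described above: inherit
// ConcurrentQueueDefaultTraits and shadow only the members you want changed.
// `BigBlockTraits`, `traits_sketch`, and the chosen constants are illustrative
// assumptions, not values used by this library or by dispenso.
#if 0
#include <cstdint>
#include "concurrentqueue.h"

struct BigBlockTraits : public moodycamel::ConcurrentQueueDefaultTraits
{
    static const size_t BLOCK_SIZE = 256;  // must stay a power of 2
    typedef std::uint64_t index_t;         // must be at least as wide as size_t
};

void traits_sketch()
{
    moodycamel::ConcurrentQueue<int, BigBlockTraits> q;
    q.enqueue(42);

    int item;
    if (q.try_dequeue(item)) {
        // item is now 42
    }
}
#endif
// ---------------------------------------------------------------------------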
struct ProducerToken; struct ConsumerToken; template class ConcurrentQueue; template class BlockingConcurrentQueue; class ConcurrentQueueTests; namespace details { struct ConcurrentQueueProducerTypelessBase { ConcurrentQueueProducerTypelessBase* next; std::atomic inactive; ProducerToken* token; ConcurrentQueueProducerTypelessBase() : next(nullptr), inactive(false), token(nullptr) { } }; template struct _hash_32_or_64 { static inline std::uint32_t hash(std::uint32_t h) { // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp // Since the thread ID is already unique, all we really want to do is propagate that // uniqueness evenly across all the bits, so that we can use a subset of the bits while // reducing collisions significantly h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; return h ^ (h >> 16); } }; template<> struct _hash_32_or_64<1> { static inline std::uint64_t hash(std::uint64_t h) { h ^= h >> 33; h *= 0xff51afd7ed558ccd; h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53; return h ^ (h >> 33); } }; template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; static inline size_t hash_thread_id(thread_id_t id) { static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( thread_id_converter::prehash(id))); } template static inline bool circular_less_than(T a, T b) { static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
} template static inline char* align_for(char* ptr) { const std::size_t alignment = std::alignment_of::value; return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; } template static inline T ceil_to_pow_2(T x) { static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 --x; x |= x >> 1; x |= x >> 2; x |= x >> 4; for (std::size_t i = 1; i < sizeof(T); i <<= 1) { x |= x >> (i << 3); } ++x; return x; } template static inline void swap_relaxed(std::atomic& left, std::atomic& right) { T temp = std::move(left.load(std::memory_order_relaxed)); left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); right.store(std::move(temp), std::memory_order_relaxed); } template static inline T const& nomove(T const& x) { return x; } template struct nomove_if { template static inline T const& eval(T const& x) { return x; } }; template<> struct nomove_if { template static inline auto eval(U&& x) -> decltype(std::forward(x)) { return std::forward(x); } }; template static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { return *it; } #if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) template struct is_trivially_destructible : std::is_trivially_destructible { }; #else template struct is_trivially_destructible : std::has_trivial_destructor { }; #endif #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY typedef RelacyThreadExitListener ThreadExitListener; typedef RelacyThreadExitNotifier ThreadExitNotifier; #else class ThreadExitNotifier; struct ThreadExitListener { typedef void (*callback_t)(void*); callback_t callback; void* userData; ThreadExitListener* next; // reserved for use by the ThreadExitNotifier ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier }; class ThreadExitNotifier { public: static void subscribe(ThreadExitListener* listener) { auto& tlsInst = instance(); std::lock_guard guard(mutex()); listener->next = tlsInst.tail; listener->chain = &tlsInst; tlsInst.tail = listener; } static void unsubscribe(ThreadExitListener* listener) { std::lock_guard guard(mutex()); if (!listener->chain) { return; // race with ~ThreadExitNotifier } auto& tlsInst = *listener->chain; listener->chain = nullptr; ThreadExitListener** prev = &tlsInst.tail; for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { if (ptr == listener) { *prev = ptr->next; break; } prev = &ptr->next; } } private: ThreadExitNotifier() : tail(nullptr) { } ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; ~ThreadExitNotifier() { // This thread is about to exit, let everyone know! assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); std::lock_guard guard(mutex()); for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { ptr->chain = nullptr; ptr->callback(ptr->userData); } } // Thread-local static inline ThreadExitNotifier& instance() { static thread_local ThreadExitNotifier notifier; return notifier; } static inline std::mutex& mutex() { // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called static std::mutex mutex; return mutex; } private: ThreadExitListener* tail; }; #endif #endif template struct static_is_lock_free_num { enum { value = 0 }; }; template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; template struct static_is_lock_free : static_is_lock_free_num::type> { }; template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; } struct ProducerToken { template explicit ProducerToken(ConcurrentQueue& queue); template explicit ProducerToken(BlockingConcurrentQueue& queue); ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT : producer(other.producer) { other.producer = nullptr; if (producer != nullptr) { producer->token = this; } } inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT { swap(other); return *this; } void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT { std::swap(producer, other.producer); if (producer != nullptr) { producer->token = this; } if (other.producer != nullptr) { other.producer->token = &other; } } // A token is always valid unless: // 1) Memory allocation failed during construction // 2) It was moved via the move constructor // (Note: assignment does a swap, leaving both potentially valid) // 3) The associated queue was destroyed // Note that if valid() returns true, that only indicates // that the token is valid for use with a specific queue, // but not which one; that's up to the user to track. 
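    // [Editorial note -- illustrative, not upstream code.] In practice the caller
    // should check a freshly constructed token before relying on it, e.g.
    //
    //   moodycamel::ProducerToken ptok(q);   // q is some ConcurrentQueue<T>
    //   if (!ptok.valid()) {
    //       // allocation failed; fall back to the tokenless enqueue methods
    //   }
    //
    // valid() is declared immediately below.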
inline bool valid() const { return producer != nullptr; } ~ProducerToken() { if (producer != nullptr) { producer->token = nullptr; producer->inactive.store(true, std::memory_order_release); } } // Disable copying and assignment ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; private: template friend class ConcurrentQueue; friend class ConcurrentQueueTests; protected: details::ConcurrentQueueProducerTypelessBase* producer; }; struct ConsumerToken { template explicit ConsumerToken(ConcurrentQueue& q); template explicit ConsumerToken(BlockingConcurrentQueue& q); ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) { } inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT { swap(other); return *this; } void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT { std::swap(initialOffset, other.initialOffset); std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); std::swap(currentProducer, other.currentProducer); std::swap(desiredProducer, other.desiredProducer); } // Disable copying and assignment ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; private: template friend class ConcurrentQueue; friend class ConcurrentQueueTests; private: // but shared with ConcurrentQueue std::uint32_t initialOffset; std::uint32_t lastKnownGlobalOffset; std::uint32_t itemsConsumedFromCurrent; details::ConcurrentQueueProducerTypelessBase* currentProducer; details::ConcurrentQueueProducerTypelessBase* desiredProducer; }; // Need to forward-declare this swap because it's in a namespace. // See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces template inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; template class ConcurrentQueue { public: typedef ::moodycamel::ProducerToken producer_token_t; typedef ::moodycamel::ConsumerToken consumer_token_t; typedef typename Traits::index_t index_t; typedef typename Traits::size_t size_t; static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) #pragma warning(disable: 4309) // static_cast: Truncation of constant value #endif static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? 
details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); public: // Creates a queue with at least `capacity` element slots; note that the // actual number of elements that can be inserted without additional memory // allocation depends on the number of producers and the block size (e.g. if // the block size is equal to `capacity`, only a single block will be allocated // up-front, which means only a single producer will be able to enqueue elements // without an extra allocation -- blocks aren't shared between producers). // This method is not thread safe -- it is up to the user to ensure that the // queue is fully constructed before it starts being used by other threads (this // includes making the memory effects of construction visible, possibly with a // memory barrier). explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); populate_initial_implicit_producer_hash(); populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG // Track all the producers using a fully-resolved typed list for // each kind; this makes it possible to debug them starting from // the root queue object (otherwise wacky casts are needed that // don't compile in the debugger's expression evaluator). explicitProducers.store(nullptr, std::memory_order_relaxed); implicitProducers.store(nullptr, std::memory_order_relaxed); #endif } // Computes the correct amount of pre-allocated blocks for you based // on the minimum number of elements you want available at any given // time, and the maximum concurrent number of each type of producer. 
ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); populate_initial_implicit_producer_hash(); size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); populate_initial_block_list(blocks); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG explicitProducers.store(nullptr, std::memory_order_relaxed); implicitProducers.store(nullptr, std::memory_order_relaxed); #endif } // Note: The queue should not be accessed concurrently while it's // being deleted. It's up to the user to synchronize this. // This method is not thread safe. ~ConcurrentQueue() { // Destroy producers auto ptr = producerListTail.load(std::memory_order_relaxed); while (ptr != nullptr) { auto next = ptr->next_prod(); if (ptr->token != nullptr) { ptr->token->producer = nullptr; } destroy(ptr); ptr = next; } // Destroy implicit producer hash tables MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { auto hash = implicitProducerHash.load(std::memory_order_relaxed); while (hash != nullptr) { auto prev = hash->prev; if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically for (size_t i = 0; i != hash->capacity; ++i) { hash->entries[i].~ImplicitProducerKVP(); } hash->~ImplicitProducerHash(); (Traits::free)(hash); } hash = prev; } } // Destroy global free list auto block = freeList.head_unsafe(); while (block != nullptr) { auto next = block->freeListNext.load(std::memory_order_relaxed); if (block->dynamicallyAllocated) { destroy(block); } block = next; } // Destroy initial free list destroy_array(initialBlockPool, initialBlockPoolSize); } // Disable copying and copy assignment ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; // Moving is supported, but note that it is *not* a thread-safe operation. // Nobody can use the queue while it's being moved, and the memory effects // of that move must be propagated to other threads before they can use it. // Note: When a queue is moved, its tokens are still valid but can only be // used with the destination queue (i.e. semantically they are moved along // with the queue itself). 
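    // [Editorial note -- illustrative sketch, not upstream code.] The constructors
    // above and the move support below might be used as follows; `Job` and the
    // chosen sizes are assumptions made only for this example.
    //
    //   // At least 4096 element slots pre-allocated (shared across producers):
    //   moodycamel::ConcurrentQueue<Job> q1(4096);
    //
    //   // Pre-sized for 4096 elements, at most 8 explicit (token) producers and
    //   // 4 implicit (tokenless) producers:
    //   moodycamel::ConcurrentQueue<Job> q2(4096, 8, 4);
    //
    //   // Moving is supported but not thread-safe; q2's tokens now belong to q3:
    //   moodycamel::ConcurrentQueue<Job> q3(std::move(q2));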
ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), producerCount(other.producerCount.load(std::memory_order_relaxed)), initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), initialBlockPool(other.initialBlockPool), initialBlockPoolSize(other.initialBlockPoolSize), freeList(std::move(other.freeList)), nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { // Move the other one into this, and leave the other one as an empty queue implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); populate_initial_implicit_producer_hash(); swap_implicit_producer_hashes(other); other.producerListTail.store(nullptr, std::memory_order_relaxed); other.producerCount.store(0, std::memory_order_relaxed); other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); other.explicitProducers.store(nullptr, std::memory_order_relaxed); implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); other.implicitProducers.store(nullptr, std::memory_order_relaxed); #endif other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); other.initialBlockPoolSize = 0; other.initialBlockPool = nullptr; reown_producers(); } inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT { return swap_internal(other); } // Swaps this queue's state with the other's. Not thread-safe. // Swapping two queues does not invalidate their tokens, however // the tokens that were created for one queue must be used with // only the swapped queue (i.e. the tokens are tied to the // queue's movable state, not the object itself). inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT { swap_internal(other); } private: ConcurrentQueue& swap_internal(ConcurrentQueue& other) { if (this == &other) { return *this; } details::swap_relaxed(producerListTail, other.producerListTail); details::swap_relaxed(producerCount, other.producerCount); details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); std::swap(initialBlockPool, other.initialBlockPool); std::swap(initialBlockPoolSize, other.initialBlockPoolSize); freeList.swap(other.freeList); details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); swap_implicit_producer_hashes(other); reown_producers(); other.reown_producers(); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG details::swap_relaxed(explicitProducers, other.explicitProducers); details::swap_relaxed(implicitProducers, other.implicitProducers); #endif return *this; } public: // Enqueues a single item (by copying it). // Allocates memory if required. Only fails if memory allocation fails (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(T const& item) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue(item); } // Enqueues a single item (by moving it, if possible). 
// Allocates memory if required. Only fails if memory allocation fails (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(T&& item) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue(std::move(item)); } // Enqueues a single item (by copying it) using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails (or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(producer_token_t const& token, T const& item) { return inner_enqueue(token, item); } // Enqueues a single item (by moving it, if possible) using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails (or // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Thread-safe. inline bool enqueue(producer_token_t const& token, T&& item) { return inner_enqueue(token, std::move(item)); } // Enqueues several items. // Allocates memory if required. Only fails if memory allocation fails (or // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Note: Use std::make_move_iterator if the elements should be moved instead of copied. // Thread-safe. template bool enqueue_bulk(It itemFirst, size_t count) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue_bulk(itemFirst, count); } // Enqueues several items using an explicit producer token. // Allocates memory if required. Only fails if memory allocation fails // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { return inner_enqueue_bulk(token, itemFirst, count); } // Enqueues a single item (by copying it). // Does not allocate memory. Fails if not enough room to enqueue (or implicit // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE // is 0). // Thread-safe. inline bool try_enqueue(T const& item) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue(item); } // Enqueues a single item (by moving it, if possible). // Does not allocate memory (except for one-time implicit producer). // Fails if not enough room to enqueue (or implicit production is // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). // Thread-safe. inline bool try_enqueue(T&& item) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue(std::move(item)); } // Enqueues a single item (by copying it) using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Thread-safe. inline bool try_enqueue(producer_token_t const& token, T const& item) { return inner_enqueue(token, item); } // Enqueues a single item (by moving it, if possible) using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Thread-safe. inline bool try_enqueue(producer_token_t const& token, T&& item) { return inner_enqueue(token, std::move(item)); } // Enqueues several items. 
// Does not allocate memory (except for one-time implicit producer). // Fails if not enough room to enqueue (or implicit production is // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template bool try_enqueue_bulk(It itemFirst, size_t count) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; else return inner_enqueue_bulk(itemFirst, count); } // Enqueues several items using an explicit producer token. // Does not allocate memory. Fails if not enough room to enqueue. // Note: Use std::make_move_iterator if the elements should be moved // instead of copied. // Thread-safe. template bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { return inner_enqueue_bulk(token, itemFirst, count); } // Attempts to dequeue from the queue. // Returns false if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template bool try_dequeue(U& item) { // Instead of simply trying each producer in turn (which could cause needless contention on the first // producer), we score them heuristically. size_t nonEmptyCount = 0; ProducerBase* best = nullptr; size_t bestSize = 0; for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { auto size = ptr->size_approx(); if (size > 0) { if (size > bestSize) { bestSize = size; best = ptr; } ++nonEmptyCount; } } // If there was at least one non-empty queue but it appears empty at the time // we try to dequeue from it, we need to make sure every queue's been tried if (nonEmptyCount > 0) { if ((details::likely)(best->dequeue(item))) { return true; } for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { if (ptr != best && ptr->dequeue(item)) { return true; } } } return false; } // Attempts to dequeue from the queue. // Returns false if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // This differs from the try_dequeue(item) method in that this one does // not attempt to reduce contention by interleaving the order that producer // streams are dequeued from. So, using this method can reduce overall throughput // under contention, but will give more predictable results in single-threaded // consumer scenarios. This is mostly only useful for internal unit tests. // Never allocates. Thread-safe. template bool try_dequeue_non_interleaved(U& item) { for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { if (ptr->dequeue(item)) { return true; } } return false; } // Attempts to dequeue from the queue using an explicit consumer token. // Returns false if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. 
template bool try_dequeue(consumer_token_t& token, U& item) { // The idea is roughly as follows: // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place // If there's no items where you're supposed to be, keep moving until you find a producer with some items // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { if (!update_current_producer_after_rotation(token)) { return false; } } // If there was at least one non-empty queue but it appears empty at the time // we try to dequeue from it, we need to make sure every queue's been tried if (static_cast(token.currentProducer)->dequeue(item)) { if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); } return true; } auto tail = producerListTail.load(std::memory_order_acquire); auto ptr = static_cast(token.currentProducer)->next_prod(); if (ptr == nullptr) { ptr = tail; } while (ptr != static_cast(token.currentProducer)) { if (ptr->dequeue(item)) { token.currentProducer = ptr; token.itemsConsumedFromCurrent = 1; return true; } ptr = ptr->next_prod(); if (ptr == nullptr) { ptr = tail; } } return false; } // Attempts to dequeue several elements from the queue. // Returns the number of items actually dequeued. // Returns 0 if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template size_t try_dequeue_bulk(It itemFirst, size_t max) { size_t count = 0; for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { count += ptr->dequeue_bulk(itemFirst, max - count); if (count == max) { break; } } return count; } // Attempts to dequeue several elements from the queue using an explicit consumer token. // Returns the number of items actually dequeued. // Returns 0 if all producer streams appeared empty at the time they // were checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. 
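    // [Editorial note -- hedged usage sketch, not part of the upstream header.]
    // It ties together the earlier recommendation: one producer token and one
    // consumer token per thread, combined with the bulk methods. The function
    // name `bulk_sketch` and the batch sizes are assumptions for illustration.
#if 0
#include <cstddef>
#include "concurrentqueue.h"

void bulk_sketch(moodycamel::ConcurrentQueue<int>& q)
{
    // Producer side: per-thread token plus bulk enqueue.
    moodycamel::ProducerToken ptok(q);
    int items[64];
    for (int i = 0; i != 64; ++i) {
        items[i] = i;
    }
    // Copies the items; wrap the iterator with std::make_move_iterator to move instead.
    q.enqueue_bulk(ptok, items, 64);

    // Consumer side: per-thread token plus bulk dequeue.
    moodycamel::ConsumerToken ctok(q);
    int out[16];
    std::size_t n;
    while ((n = q.try_dequeue_bulk(ctok, out, 16)) != 0) {
        // process out[0] .. out[n - 1]
    }
}
#endif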
template size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) { if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { if (!update_current_producer_after_rotation(token)) { return 0; } } size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); if (count == max) { if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); } return max; } token.itemsConsumedFromCurrent += static_cast(count); max -= count; auto tail = producerListTail.load(std::memory_order_acquire); auto ptr = static_cast(token.currentProducer)->next_prod(); if (ptr == nullptr) { ptr = tail; } while (ptr != static_cast(token.currentProducer)) { auto dequeued = ptr->dequeue_bulk(itemFirst, max); count += dequeued; if (dequeued != 0) { token.currentProducer = ptr; token.itemsConsumedFromCurrent = static_cast(dequeued); } if (dequeued == max) { break; } max -= dequeued; ptr = ptr->next_prod(); if (ptr == nullptr) { ptr = tail; } } return count; } // Attempts to dequeue from a specific producer's inner queue. // If you happen to know which producer you want to dequeue from, this // is significantly faster than using the general-case try_dequeue methods. // Returns false if the producer's queue appeared empty at the time it // was checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) { return static_cast(producer.producer)->dequeue(item); } // Attempts to dequeue several elements from a specific producer's inner queue. // Returns the number of items actually dequeued. // If you happen to know which producer you want to dequeue from, this // is significantly faster than using the general-case try_dequeue methods. // Returns 0 if the producer's queue appeared empty at the time it // was checked (so, the queue is likely but not guaranteed to be empty). // Never allocates. Thread-safe. template inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) { return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); } // Returns an estimate of the total number of elements currently in the queue. This // estimate is only accurate if the queue has completely stabilized before it is called // (i.e. all enqueue and dequeue operations have completed and their memory effects are // visible on the calling thread, and no further operations start while this method is // being called). // Thread-safe. size_t size_approx() const { size_t size = 0; for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { size += ptr->size_approx(); } return size; } // Returns true if the underlying atomic variables used by // the queue are lock-free (they should be on most platforms). // Thread-safe. 
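    // [Editorial note -- illustrative only, not upstream code.] Both introspection
    // helpers are best-effort queries:
    //
    //   moodycamel::ConcurrentQueue<int> q;
    //   std::size_t roughly = q.size_approx();  // estimate; see the caveat above
    //
    //   static_assert(moodycamel::ConcurrentQueue<int>::is_lock_free(),
    //                 "expected lock-free atomics on this platform");
    //
    // The static_assert encodes an assumption about the target platform;
    // is_lock_free() (defined below) may legitimately return false elsewhere.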
static constexpr bool is_lock_free() { return details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && details::static_is_lock_free::value == 2 && details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; } private: friend struct ProducerToken; friend struct ConsumerToken; struct ExplicitProducer; friend struct ExplicitProducer; struct ImplicitProducer; friend struct ImplicitProducer; friend class ConcurrentQueueTests; enum AllocationMode { CanAlloc, CannotAlloc }; /////////////////////////////// // Queue methods /////////////////////////////// template inline bool inner_enqueue(producer_token_t const& token, U&& element) { return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); } template inline bool inner_enqueue(U&& element) { auto producer = get_or_add_implicit_producer(); return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); } template inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) { return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); } template inline bool inner_enqueue_bulk(It itemFirst, size_t count) { auto producer = get_or_add_implicit_producer(); return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); } inline bool update_current_producer_after_rotation(consumer_token_t& token) { // Ah, there's been a rotation, figure out where we should be! auto tail = producerListTail.load(std::memory_order_acquire); if (token.desiredProducer == nullptr && tail == nullptr) { return false; } auto prodCount = producerCount.load(std::memory_order_relaxed); auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); if ((details::unlikely)(token.desiredProducer == nullptr)) { // Aha, first time we're dequeueing anything. // Figure out our local position // Note: offset is from start, not end, but we're traversing from end -- subtract from count first std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); token.desiredProducer = tail; for (std::uint32_t i = 0; i != offset; ++i) { token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); if (token.desiredProducer == nullptr) { token.desiredProducer = tail; } } } std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; if (delta >= prodCount) { delta = delta % prodCount; } for (std::uint32_t i = 0; i != delta; ++i) { token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); if (token.desiredProducer == nullptr) { token.desiredProducer = tail; } } token.lastKnownGlobalOffset = globalOffset; token.currentProducer = token.desiredProducer; token.itemsConsumedFromCurrent = 0; return true; } /////////////////////////// // Free list /////////////////////////// template struct FreeListNode { FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } std::atomic freeListRefs; std::atomic freeListNext; }; // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly // speedy under low contention. 
template // N must inherit FreeListNode or have the same fields (and initialization of them) struct FreeList { FreeList() : freeListHead(nullptr) { } FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; inline void add(N* node) { #ifdef MCDBGQ_NOLOCKFREE_FREELIST debug::DebugLock lock(mutex); #endif // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to // set it using a fetch_add if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { // Oh look! We were the last ones referencing this node, and we know // we want to add it to the free list, so let's do it! add_knowing_refcount_is_zero(node); } } inline N* try_get() { #ifdef MCDBGQ_NOLOCKFREE_FREELIST debug::DebugLock lock(mutex); #endif auto head = freeListHead.load(std::memory_order_acquire); while (head != nullptr) { auto prevHead = head; auto refs = head->freeListRefs.load(std::memory_order_relaxed); if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { head = freeListHead.load(std::memory_order_acquire); continue; } // Good, reference count has been incremented (it wasn't at zero), which means we can read the // next and not worry about it changing between now and the time we do the CAS auto next = head->freeListNext.load(std::memory_order_relaxed); if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); // Decrease refcount twice, once for our ref, and once for the list's ref head->freeListRefs.fetch_sub(2, std::memory_order_release); return head; } // OK, the head must have changed on us, but we still need to decrease the refcount we increased. // Note that we don't need to release any memory effects, but we do need to ensure that the reference // count decrement happens-after the CAS on the head. refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); if (refs == SHOULD_BE_ON_FREELIST + 1) { add_knowing_refcount_is_zero(prevHead); } } return nullptr; } // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } private: inline void add_knowing_refcount_is_zero(N* node) { // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run // only one copy of this method per node at a time, i.e. 
the single thread case), then we know // we can safely change the next pointer of the node; however, once the refcount is back above // zero, then other threads could increase it (happens under heavy contention, when the refcount // goes to zero in between a load and a refcount increment of a node in try_get, then back up to // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS // to add the node to the actual list fails, decrease the refcount and leave the add operation to // the next thread who puts the refcount back at zero (which could be us, hence the loop). auto head = freeListHead.load(std::memory_order_relaxed); while (true) { node->freeListNext.store(head, std::memory_order_relaxed); node->freeListRefs.store(1, std::memory_order_release); if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { // Hmm, the add failed, but we can only try again when the refcount goes back to zero if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { continue; } } return; } } private: // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) std::atomic freeListHead; static const std::uint32_t REFS_MASK = 0x7FFFFFFF; static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; #ifdef MCDBGQ_NOLOCKFREE_FREELIST debug::DebugMutex mutex; #endif }; /////////////////////////// // Block /////////////////////////// enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; struct Block { Block() : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) { #ifdef MCDBGQ_TRACKMEM owner = nullptr; #endif } template inline bool is_empty() const { MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Check flags for (size_t i = 0; i < BLOCK_SIZE; ++i) { if (!emptyFlags[i].load(std::memory_order_relaxed)) { return false; } } // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set std::atomic_thread_fence(std::memory_order_acquire); return true; } else { // Check counter if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { std::atomic_thread_fence(std::memory_order_acquire); return true; } assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); return false; } } // Returns true if the block is now empty (does not apply in explicit context) template inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set flag assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); return false; } else { // Increment counter auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); assert(prevVal < BLOCK_SIZE); return prevVal == BLOCK_SIZE - 1; } } // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). // Returns true if the block is now empty (does not apply in explicit context). 
template inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) { MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set flags std::atomic_thread_fence(std::memory_order_release); i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; for (size_t j = 0; j != count; ++j) { assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); emptyFlags[i + j].store(true, std::memory_order_relaxed); } return false; } else { // Increment counter auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); assert(prevVal + count <= BLOCK_SIZE); return prevVal + count == BLOCK_SIZE; } } template inline void set_all_empty() { MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set all flags for (size_t i = 0; i != BLOCK_SIZE; ++i) { emptyFlags[i].store(true, std::memory_order_relaxed); } } else { // Reset counter elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); } } template inline void reset_empty() { MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Reset flags for (size_t i = 0; i != BLOCK_SIZE; ++i) { emptyFlags[i].store(false, std::memory_order_relaxed); } } else { // Reset counter elementsCompletelyDequeued.store(0, std::memory_order_relaxed); } } inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } private: static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; public: Block* next; std::atomic elementsCompletelyDequeued; std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; public: std::atomic freeListRefs; std::atomic freeListNext; bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' #ifdef MCDBGQ_TRACKMEM void* owner; #endif }; static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); #ifdef MCDBGQ_TRACKMEM public: struct MemStats; private: #endif /////////////////////////// // Producer base /////////////////////////// struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase { ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : tailIndex(0), headIndex(0), dequeueOptimisticCount(0), dequeueOvercommit(0), tailBlock(nullptr), isExplicit(isExplicit_), parent(parent_) { } virtual ~ProducerBase() { } template inline bool dequeue(U& element) { if (isExplicit) { return static_cast(this)->dequeue(element); } else { return static_cast(this)->dequeue(element); } } template inline size_t dequeue_bulk(It& itemFirst, size_t max) { if (isExplicit) { return static_cast(this)->dequeue_bulk(itemFirst, max); } else { return static_cast(this)->dequeue_bulk(itemFirst, max); } } inline ProducerBase* next_prod() const { return static_cast(next); } inline size_t size_approx() const { auto tail = tailIndex.load(std::memory_order_relaxed); auto head = headIndex.load(std::memory_order_relaxed); return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; } inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } protected: std::atomic tailIndex; // Where to enqueue to next std::atomic headIndex; // Where to dequeue from next std::atomic dequeueOptimisticCount; std::atomic dequeueOvercommit; Block* tailBlock; public: bool isExplicit; ConcurrentQueue* parent; protected: #ifdef MCDBGQ_TRACKMEM friend struct MemStats; #endif }; /////////////////////////// // Explicit queue /////////////////////////// struct ExplicitProducer : public ProducerBase { explicit ExplicitProducer(ConcurrentQueue* parent_) : ProducerBase(parent_, true), blockIndex(nullptr), pr_blockIndexSlotsUsed(0), pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), pr_blockIndexFront(0), pr_blockIndexEntries(nullptr), pr_blockIndexRaw(nullptr) { size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; if (poolBasedIndexSize > pr_blockIndexSize) { pr_blockIndexSize = poolBasedIndexSize; } new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE } ~ExplicitProducer() override { // Destruct any elements not yet dequeued. // Since we're in the destructor, we can assume all elements // are either completely dequeued or completely not (no halfways). 
if (this->tailBlock != nullptr) { // Note this means there must be a block index too // First find the block that's partially dequeued, if any Block* halfDequeuedBlock = nullptr; if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { // The head's not on a block boundary, meaning a block somewhere is partially dequeued // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { i = (i + 1) & (pr_blockIndexSize - 1); } assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); halfDequeuedBlock = pr_blockIndexEntries[i].block; } // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) auto block = this->tailBlock; do { block = block->next; if (block->ConcurrentQueue::Block::template is_empty()) { continue; } size_t i = 0; // Offset into block if (block == halfDequeuedBlock) { i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); } // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { (*block)[i++]->~T(); } } while (block != this->tailBlock); } // Destroy all blocks that we own if (this->tailBlock != nullptr) { auto block = this->tailBlock; do { auto nextBlock = block->next; this->parent->add_block_to_free_list(block); block = nextBlock; } while (block != this->tailBlock); } // Destroy the block indices auto header = static_cast(pr_blockIndexRaw); while (header != nullptr) { auto prev = static_cast(header->prev); header->~BlockIndexHeader(); (Traits::free)(header); header = prev; } } template inline bool enqueue(U&& element) { index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); index_t newTailIndex = 1 + currentTailIndex; if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { // We reached the end of a block, start a new one auto startBlock = this->tailBlock; auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { // We can re-use the block ahead of us, it's empty! this->tailBlock = this->tailBlock->next; this->tailBlock->ConcurrentQueue::Block::template reset_empty(); // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the // last block from it first -- except instead of removing then adding, we can just overwrite). // Note that there must be a valid block index here, since even if allocation failed in the ctor, // it would have been re-attempted when adding the first block to the queue; since there is such // a block, a block index must have been successfully allocated. } else { // Whatever head value we see here is >= the last value we saw here (relatively), // and <= its current value. Since we have the most recent tail, the head must be // <= to it. 
auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than(currentTailIndex, head)); if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { // We can't enqueue in another block because there's not enough leeway -- the // tail could surpass the head by the time the block fills up! (Or we'll exceed // the size limit, if the second part of the condition was true.) return false; } // We're going to need a new block; check that the block index has room if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { // Hmm, the circular block index is already full -- we'll need // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if // the initial allocation failed in the constructor. MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { return false; } else if (!new_block_index(pr_blockIndexSlotsUsed)) { return false; } } // Insert a new block in the circular linked list auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); if (newBlock == nullptr) { return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template reset_empty(); if (this->tailBlock == nullptr) { newBlock->next = newBlock; } else { newBlock->next = this->tailBlock->next; this->tailBlock->next = newBlock; } this->tailBlock = newBlock; ++pr_blockIndexSlotsUsed; } MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { // The constructor may throw. We want the element not to appear in the queue in // that case (without corrupting the queue): MOODYCAMEL_TRY { new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); } MOODYCAMEL_CATCH (...) { // Revert change to the current block, but leave the new block available // for next time pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; MOODYCAMEL_RETHROW; } } else { (void)startBlock; (void)originalBlockIndexSlotsUsed; } // Add block to block index auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; entry.base = currentTailIndex; entry.block = this->tailBlock; blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } } // Enqueue new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } template bool dequeue(U& element) { auto tail = this->tailIndex.load(std::memory_order_relaxed); auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { // Might be something to dequeue, let's give it a try // Note that this if is purely for performance purposes in the common case when the queue is // empty and the values are eventually consistent -- we may enter here spuriously. 
// Note that whatever the values of overcommit and tail are, they are not going to change (unless we // change them) and must be the same value at this point (inside the if) as when the if condition was // evaluated. // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all // read-modify-write operations are guaranteed to work on the latest value in the modification order), but // unfortunately that can't be shown to be correct using only the C++11 standard. // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case std::atomic_thread_fence(std::memory_order_acquire); // Increment optimistic counter, then check if it went over the boundary auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. // Note that we reload tail here in case it changed; it will be the same value as before or greater, since // this load is sequenced after (happens after) the earlier load above. This is supported by read-read // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order tail = this->tailIndex.load(std::memory_order_acquire); if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { // Guaranteed to be at least one element to dequeue! // Get the index. Note that since there's guaranteed to be at least one element, this // will never exceed tail. We need to do an acquire-release fence here since it's possible // that whatever condition got us to this point was for an earlier enqueued element (that // we already see the memory effects for), but that by the time we increment somebody else // has incremented it, and we need to see the memory effects for *that* element, which is // in such a case is necessarily visible on the thread that incremented it in the first // place with the more current condition (they must have acquired a tail that is at least // as recent). auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); // Determine which block the element is in auto localBlockIndex = blockIndex.load(std::memory_order_acquire); auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); // We need to be careful here about subtracting and dividing because of index wrap-around. 
// When an index wraps, we need to preserve the sign of the offset when dividing it by the // block size (in order to get a correct signed block count offset in all cases): auto headBase = localBlockIndex->entries[localBlockIndexHead].base; auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; // Dequeue auto& el = *((*block)[index]); if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { // Make sure the element is still fully dequeued and destroyed even if the assignment // throws struct Guard { Block* block; index_t index; ~Guard() { (*block)[index]->~T(); block->ConcurrentQueue::Block::template set_empty(index); } } guard = { block, index }; element = std::move(el); // NOLINT } else { element = std::move(el); // NOLINT el.~T(); // NOLINT block->ConcurrentQueue::Block::template set_empty(index); } return true; } else { // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write } } return false; } template bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) { // First, we need to make sure we have enough room to enqueue all of the elements; // this means pre-allocating blocks and putting them in the block index (but only if // all the allocations succeeded). index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); auto startBlock = this->tailBlock; auto originalBlockIndexFront = pr_blockIndexFront; auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; Block* firstAllocatedBlock = nullptr; // Figure out how many blocks we'll need to allocate, and do so size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); if (blockBaseDiff > 0) { // Allocate as many blocks as possible from ahead while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { blockBaseDiff -= static_cast(BLOCK_SIZE); currentTailIndex += static_cast(BLOCK_SIZE); this->tailBlock = this->tailBlock->next; firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; entry.base = currentTailIndex; entry.block = this->tailBlock; pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); } // Now allocate as many blocks as necessary from the block pool while (blockBaseDiff > 0) { blockBaseDiff -= static_cast(BLOCK_SIZE); currentTailIndex += static_cast(BLOCK_SIZE); auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than(currentTailIndex, head)); bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { // Failed to allocate, undo changes (but keep injected blocks) pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; return false; } else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { // Failed to allocate, undo changes (but keep injected blocks) pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; return false; } // pr_blockIndexFront is updated inside new_block_index, so we need to // update our fallback value too (since we keep the new index even if we // later fail) originalBlockIndexFront = originalBlockIndexSlotsUsed; } // Insert a new block in the circular linked list auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); if (newBlock == nullptr) { pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template set_all_empty(); if (this->tailBlock == nullptr) { newBlock->next = newBlock; } else { newBlock->next = this->tailBlock->next; this->tailBlock->next = newBlock; } this->tailBlock = newBlock; firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; ++pr_blockIndexSlotsUsed; auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; entry.base = currentTailIndex; entry.block = this->tailBlock; pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); } // Excellent, all allocations succeeded. 
Reset each block's emptiness before we fill them up, and // publish the new block index front auto block = firstAllocatedBlock; while (true) { block->ConcurrentQueue::Block::template reset_empty(); if (block == this->tailBlock) { break; } block = block->next; } MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); } } // Enqueue, one block at a time index_t newTailIndex = startTailIndex + static_cast(count); currentTailIndex = startTailIndex; auto endBlock = this->tailBlock; this->tailBlock = startBlock; assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { this->tailBlock = firstAllocatedBlock; } while (true) { index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); if (details::circular_less_than(newTailIndex, stopIndex)) { stopIndex = newTailIndex; } MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { while (currentTailIndex != stopIndex) { new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); } } else { MOODYCAMEL_TRY { while (currentTailIndex != stopIndex) { // Must use copy constructor even if move constructor is available // because we may have to revert if there's an exception. // Sorry about the horrible templated next line, but it was the only way // to disable moving *at compile time*, which is important because a type // may only define a (noexcept) move constructor, and so calls to the // cctor will not compile, even if they are in an if branch that will never // be executed new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); ++currentTailIndex; ++itemFirst; } } MOODYCAMEL_CATCH (...) { // Oh dear, an exception's been thrown -- destroy the elements that // were enqueued so far and revert the entire bulk operation (we'll keep // any allocated blocks in our linked list for later, though). auto constructedStopIndex = currentTailIndex; auto lastBlockEnqueued = this->tailBlock; pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; if (!details::is_trivially_destructible::value) { auto block = startBlock; if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { block = firstAllocatedBlock; } currentTailIndex = startTailIndex; while (true) { stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); if (details::circular_less_than(constructedStopIndex, stopIndex)) { stopIndex = constructedStopIndex; } while (currentTailIndex != stopIndex) { (*block)[currentTailIndex++]->~T(); } if (block == lastBlockEnqueued) { break; } block = block->next; } } MOODYCAMEL_RETHROW; } } if (this->tailBlock == endBlock) { assert(currentTailIndex == newTailIndex); break; } this->tailBlock = this->tailBlock->next; } MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { if (firstAllocatedBlock != nullptr) blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); } this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } template size_t dequeue_bulk(It& itemFirst, size_t max) { auto tail = this->tailIndex.load(std::memory_order_relaxed); auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); if (details::circular_less_than(0, desiredCount)) { desiredCount = desiredCount < max ? desiredCount : max; std::atomic_thread_fence(std::memory_order_acquire); auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); tail = this->tailIndex.load(std::memory_order_acquire); auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); if (details::circular_less_than(0, actualCount)) { actualCount = desiredCount < actualCount ? desiredCount : actualCount; if (actualCount < desiredCount) { this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); } // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this // will never exceed tail. auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); // Determine which block the first element is in auto localBlockIndex = blockIndex.load(std::memory_order_acquire); auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); auto headBase = localBlockIndex->entries[localBlockIndexHead].base; auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); // Iterate the blocks and dequeue auto index = firstIndex; do { auto firstIndexInBlock = index; index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; auto block = localBlockIndex->entries[indexIndex].block; if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst++ = std::move(el); el.~T(); ++index; } } else { MOODYCAMEL_TRY { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst = std::move(el); ++itemFirst; el.~T(); ++index; } } MOODYCAMEL_CATCH (...) { // It's too late to revert the dequeue, but we can make sure that all // the dequeued objects are properly destroyed and the block index // (and empty count) are properly updated before we propagate the exception do { block = localBlockIndex->entries[indexIndex].block; while (index != endIndex) { (*block)[index++]->~T(); } block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); firstIndexInBlock = index; endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; } while (index != firstIndex + actualCount); MOODYCAMEL_RETHROW; } } block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); } while (index != firstIndex + actualCount); return actualCount; } else { // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); } } return 0; } private: struct BlockIndexEntry { index_t base; Block* block; }; struct BlockIndexHeader { size_t size; std::atomic front; // Current slot (not next, like pr_blockIndexFront) BlockIndexEntry* entries; void* prev; }; bool new_block_index(size_t numberOfFilledSlotsToExpose) { auto prevBlockSizeMask = pr_blockIndexSize - 1; // Create the new block pr_blockIndexSize <<= 1; auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); if (newRawPtr == nullptr) { pr_blockIndexSize >>= 1; // Reset to allow graceful retry return false; } auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); // Copy in all the old indices, if any size_t j = 0; if (pr_blockIndexSlotsUsed != 0) { auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; do { newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; i = (i + 1) & prevBlockSizeMask; } while (i != pr_blockIndexFront); } // Update everything auto header = new (newRawPtr) BlockIndexHeader; header->size = pr_blockIndexSize; header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); header->entries = newBlockIndexEntries; header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later pr_blockIndexFront = j; pr_blockIndexEntries = newBlockIndexEntries; pr_blockIndexRaw = newRawPtr; blockIndex.store(header, std::memory_order_release); return true; } private: std::atomic blockIndex; // To be used by producer only -- consumer must use the ones in referenced by blockIndex size_t pr_blockIndexSlotsUsed; size_t pr_blockIndexSize; size_t pr_blockIndexFront; // Next slot (not current) BlockIndexEntry* pr_blockIndexEntries; void* 
pr_blockIndexRaw; #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG public: ExplicitProducer* nextExplicitProducer; private: #endif #ifdef MCDBGQ_TRACKMEM friend struct MemStats; #endif }; ////////////////////////////////// // Implicit queue ////////////////////////////////// struct ImplicitProducer : public ProducerBase { ImplicitProducer(ConcurrentQueue* parent_) : ProducerBase(parent_, false), nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), blockIndex(nullptr) { new_block_index(); } ~ImplicitProducer() override { // Note that since we're in the destructor we can assume that all enqueue/dequeue operations // completed already; this means that all undequeued elements are placed contiguously across // contiguous blocks, and that only the first and last remaining blocks can be only partially // empty (all other remaining blocks must be completely full). #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // Unregister ourselves for thread termination notification if (!this->inactive.load(std::memory_order_relaxed)) { details::ThreadExitNotifier::unsubscribe(&threadExitListener); } #endif // Destroy all remaining elements! auto tail = this->tailIndex.load(std::memory_order_relaxed); auto index = this->headIndex.load(std::memory_order_relaxed); Block* block = nullptr; assert(index == tail || details::circular_less_than(index, tail)); bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed while (index != tail) { if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { if (block != nullptr) { // Free the old block this->parent->add_block_to_free_list(block); } block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); } ((*block)[index])->~T(); ++index; } // Even if the queue is empty, there's still one block that's not on the free list // (unless the head index reached the end of it, in which case the tail will be poised // to create a new block). 
if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { this->parent->add_block_to_free_list(this->tailBlock); } // Destroy block index auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); if (localBlockIndex != nullptr) { for (size_t i = 0; i != localBlockIndex->capacity; ++i) { localBlockIndex->index[i]->~BlockIndexEntry(); } do { auto prev = localBlockIndex->prev; localBlockIndex->~BlockIndexHeader(); (Traits::free)(localBlockIndex); localBlockIndex = prev; } while (localBlockIndex != nullptr); } } template inline bool enqueue(U&& element) { index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); index_t newTailIndex = 1 + currentTailIndex; if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { // We reached the end of a block, start a new one auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than(currentTailIndex, head)); if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { return false; } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif // Find out where we'll be inserting this block in the block index BlockIndexEntry* idxEntry; if (!insert_block_index_entry(idxEntry, currentTailIndex)) { return false; } // Get ahold of a new block auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); if (newBlock == nullptr) { rewind_block_index_tail(); idxEntry->value.store(nullptr, std::memory_order_relaxed); return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template reset_empty(); MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { // May throw, try to insert now before we publish the fact that we have this new block MOODYCAMEL_TRY { new ((*newBlock)[currentTailIndex]) T(std::forward(element)); } MOODYCAMEL_CATCH (...) 
{ rewind_block_index_tail(); idxEntry->value.store(nullptr, std::memory_order_relaxed); this->parent->add_block_to_free_list(newBlock); MOODYCAMEL_RETHROW; } } // Insert the new block into the index idxEntry->value.store(newBlock, std::memory_order_relaxed); this->tailBlock = newBlock; MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } } // Enqueue new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } template bool dequeue(U& element) { // See ExplicitProducer::dequeue for rationale and explanation index_t tail = this->tailIndex.load(std::memory_order_relaxed); index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { std::atomic_thread_fence(std::memory_order_acquire); index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); tail = this->tailIndex.load(std::memory_order_acquire); if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); // Determine which block the element is in auto entry = get_block_index_entry_for_index(index); // Dequeue auto block = entry->value.load(std::memory_order_relaxed); auto& el = *((*block)[index]); if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX // Note: Acquiring the mutex with every dequeue instead of only when a block // is released is very sub-optimal, but it is, after all, purely debug code. debug::DebugLock lock(producer->mutex); #endif struct Guard { Block* block; index_t index; BlockIndexEntry* entry; ConcurrentQueue* parent; ~Guard() { (*block)[index]->~T(); if (block->ConcurrentQueue::Block::template set_empty(index)) { entry->value.store(nullptr, std::memory_order_relaxed); parent->add_block_to_free_list(block); } } } guard = { block, index, entry, this->parent }; element = std::move(el); // NOLINT } else { element = std::move(el); // NOLINT el.~T(); // NOLINT if (block->ConcurrentQueue::Block::template set_empty(index)) { { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif // Add the block back into the global free pool (and remove from block index) entry->value.store(nullptr, std::memory_order_relaxed); } this->parent->add_block_to_free_list(block); // releases the above store } } return true; } else { this->dequeueOvercommit.fetch_add(1, std::memory_order_release); } } return false; } #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable: 4706) // assignment within conditional expression #endif template bool enqueue_bulk(It itemFirst, size_t count) { // First, we need to make sure we have enough room to enqueue all of the elements; // this means pre-allocating blocks and putting them in the block index (but only if // all the allocations succeeded). // Note that the tailBlock we start off with may not be owned by us any more; // this happens if it was filled up exactly to the top (setting tailIndex to // the first index of the next block which is not yet allocated), then dequeued // completely (putting it on the free list) before we enqueue again. 
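// NOTE (illustrative worked example, not part of the original implementation): the
// blockBaseDiff computation just below counts how many *new* blocks this bulk enqueue
// needs by comparing the block base of the last element against the block base of the
// slot before startTailIndex. E.g. with BLOCK_SIZE = 32, startTailIndex = 40 and
// count = 100:
//     ((40 + 100 - 1) & ~31) = 128,  ((40 - 1) & ~31) = 32,  blockBaseDiff = 96
// i.e. 96 / 32 = 3 additional blocks beyond the one already containing index 39, which
// matches the elements landing at indices 40..139 (blocks [32,64), [64,96), [96,128),
// [128,160)).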
index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); auto startBlock = this->tailBlock; Block* firstAllocatedBlock = nullptr; auto endBlock = this->tailBlock; // Figure out how many blocks we'll need to allocate, and do so size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif do { blockBaseDiff -= static_cast(BLOCK_SIZE); currentTailIndex += static_cast(BLOCK_SIZE); // Find out where we'll be inserting this block in the block index BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell Block* newBlock; bool indexInserted = false; auto head = this->headIndex.load(std::memory_order_relaxed); assert(!details::circular_less_than(currentTailIndex, head)); bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { // Index allocation or block allocation failed; revert any other allocations // and index insertions done so far for this operation if (indexInserted) { rewind_block_index_tail(); idxEntry->value.store(nullptr, std::memory_order_relaxed); } currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { currentTailIndex += static_cast(BLOCK_SIZE); idxEntry = get_block_index_entry_for_index(currentTailIndex); idxEntry->value.store(nullptr, std::memory_order_relaxed); rewind_block_index_tail(); } this->parent->add_blocks_to_free_list(firstAllocatedBlock); this->tailBlock = startBlock; return false; } #ifdef MCDBGQ_TRACKMEM newBlock->owner = this; #endif newBlock->ConcurrentQueue::Block::template reset_empty(); newBlock->next = nullptr; // Insert the new block into the index idxEntry->value.store(newBlock, std::memory_order_relaxed); // Store the chain of blocks so that we can undo if later allocations fail, // and so that we can find the blocks when we do the actual enqueueing if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { assert(this->tailBlock != nullptr); this->tailBlock->next = newBlock; } this->tailBlock = newBlock; endBlock = newBlock; firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; } while (blockBaseDiff > 0); } // Enqueue, one block at a time index_t newTailIndex = startTailIndex + static_cast(count); currentTailIndex = startTailIndex; this->tailBlock = startBlock; assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { this->tailBlock = firstAllocatedBlock; } while (true) { index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); if (details::circular_less_than(newTailIndex, stopIndex)) { stopIndex = newTailIndex; } MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { while (currentTailIndex != stopIndex) { new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); } } else { MOODYCAMEL_TRY { while (currentTailIndex != stopIndex) { new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); ++currentTailIndex; ++itemFirst; } } MOODYCAMEL_CATCH (...) { auto constructedStopIndex = currentTailIndex; auto lastBlockEnqueued = this->tailBlock; if (!details::is_trivially_destructible::value) { auto block = startBlock; if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { block = firstAllocatedBlock; } currentTailIndex = startTailIndex; while (true) { stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); if (details::circular_less_than(constructedStopIndex, stopIndex)) { stopIndex = constructedStopIndex; } while (currentTailIndex != stopIndex) { (*block)[currentTailIndex++]->~T(); } if (block == lastBlockEnqueued) { break; } block = block->next; } } currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { currentTailIndex += static_cast(BLOCK_SIZE); auto idxEntry = get_block_index_entry_for_index(currentTailIndex); idxEntry->value.store(nullptr, std::memory_order_relaxed); rewind_block_index_tail(); } this->parent->add_blocks_to_free_list(firstAllocatedBlock); this->tailBlock = startBlock; MOODYCAMEL_RETHROW; } } if (this->tailBlock == endBlock) { assert(currentTailIndex == newTailIndex); break; } this->tailBlock = this->tailBlock->next; } this->tailIndex.store(newTailIndex, std::memory_order_release); return true; } #ifdef _MSC_VER #pragma warning(pop) #endif template size_t dequeue_bulk(It& itemFirst, size_t max) { auto tail = this->tailIndex.load(std::memory_order_relaxed); auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); if (details::circular_less_than(0, desiredCount)) { desiredCount = desiredCount < max ? desiredCount : max; std::atomic_thread_fence(std::memory_order_acquire); auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); tail = this->tailIndex.load(std::memory_order_acquire); auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); if (details::circular_less_than(0, actualCount)) { actualCount = desiredCount < actualCount ? desiredCount : actualCount; if (actualCount < desiredCount) { this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); } // Get the first index. 
Note that since there's guaranteed to be at least actualCount elements, this // will never exceed tail. auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); // Iterate the blocks and dequeue auto index = firstIndex; BlockIndexHeader* localBlockIndex; auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); do { auto blockStartIndex = index; index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; auto entry = localBlockIndex->index[indexIndex]; auto block = entry->value.load(std::memory_order_relaxed); if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst++ = std::move(el); el.~T(); ++index; } } else { MOODYCAMEL_TRY { while (index != endIndex) { auto& el = *((*block)[index]); *itemFirst = std::move(el); ++itemFirst; el.~T(); ++index; } } MOODYCAMEL_CATCH (...) { do { entry = localBlockIndex->index[indexIndex]; block = entry->value.load(std::memory_order_relaxed); while (index != endIndex) { (*block)[index++]->~T(); } if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif entry->value.store(nullptr, std::memory_order_relaxed); this->parent->add_block_to_free_list(block); } indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); blockStartIndex = index; endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; } while (index != firstIndex + actualCount); MOODYCAMEL_RETHROW; } } if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif // Note that the set_many_empty above did a release, meaning that anybody who acquires the block // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
entry->value.store(nullptr, std::memory_order_relaxed); } this->parent->add_block_to_free_list(block); // releases the above store } indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); } while (index != firstIndex + actualCount); return actualCount; } else { this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); } } return 0; } private: // The block size must be > 1, so any number with the low bit set is an invalid block base index static const index_t INVALID_BLOCK_BASE = 1; struct BlockIndexEntry { std::atomic key; std::atomic value; }; struct BlockIndexHeader { size_t capacity; std::atomic tail; BlockIndexEntry* entries; BlockIndexEntry** index; BlockIndexHeader* prev; }; template inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) { auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK if (localBlockIndex == nullptr) { return false; // this can happen if new_block_index failed in the constructor } size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); idxEntry = localBlockIndex->index[newTail]; if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || idxEntry->value.load(std::memory_order_relaxed) == nullptr) { idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); localBlockIndex->tail.store(newTail, std::memory_order_release); return true; } // No room in the old block index, try to allocate another one! MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { return false; } else if (!new_block_index()) { return false; } else { localBlockIndex = blockIndex.load(std::memory_order_relaxed); newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); idxEntry = localBlockIndex->index[newTail]; assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); localBlockIndex->tail.store(newTail, std::memory_order_release); return true; } } inline void rewind_block_index_tail() { auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); } inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const { BlockIndexHeader* localBlockIndex; auto idx = get_block_index_index_for_index(index, localBlockIndex); return localBlockIndex->index[idx]; } inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX debug::DebugLock lock(mutex); #endif index &= ~static_cast(BLOCK_SIZE - 1); localBlockIndex = blockIndex.load(std::memory_order_acquire); auto tail = localBlockIndex->tail.load(std::memory_order_acquire); auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); assert(tailBase != INVALID_BLOCK_BASE); // Note: Must use division instead of shift because the index may wrap around, causing a negative // offset, whose negativity we want to preserve auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); return idx; } 
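// NOTE (illustrative, not part of the original implementation): the signed division in
// get_block_index_index_for_index above matters once indices wrap. As a toy example with
// 8-bit indices and BLOCK_SIZE = 4 (the real index_t is much wider):
//     index = 0,   tailBase = 248  ->  index - tailBase wraps to 8, i.e. +8 signed, offset = +2 blocks
//     index = 248, tailBase = 0    ->  index - tailBase = 248,      i.e. -8 signed, offset = -2 blocks
// A logical shift by log2(BLOCK_SIZE) would turn the second case into a large positive
// offset instead of -2, which is why the offset is computed with a signed divide.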
bool new_block_index() { auto prev = blockIndex.load(std::memory_order_relaxed); size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; auto raw = static_cast((Traits::malloc)( sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); if (raw == nullptr) { return false; } auto header = new (raw) BlockIndexHeader; auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); if (prev != nullptr) { auto prevTail = prev->tail.load(std::memory_order_relaxed); auto prevPos = prevTail; size_t i = 0; do { prevPos = (prevPos + 1) & (prev->capacity - 1); index[i++] = prev->index[prevPos]; } while (prevPos != prevTail); assert(i == prevCapacity); } for (size_t i = 0; i != entryCount; ++i) { new (entries + i) BlockIndexEntry; entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); index[prevCapacity + i] = entries + i; } header->prev = prev; header->entries = entries; header->index = index; header->capacity = nextBlockIndexCapacity; header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); blockIndex.store(header, std::memory_order_release); nextBlockIndexCapacity <<= 1; return true; } private: size_t nextBlockIndexCapacity; std::atomic blockIndex; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED public: details::ThreadExitListener threadExitListener; private: #endif #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG public: ImplicitProducer* nextImplicitProducer; private: #endif #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX mutable debug::DebugMutex mutex; #endif #ifdef MCDBGQ_TRACKMEM friend struct MemStats; #endif }; ////////////////////////////////// // Block pool manipulation ////////////////////////////////// void populate_initial_block_list(size_t blockCount) { initialBlockPoolSize = blockCount; if (initialBlockPoolSize == 0) { initialBlockPool = nullptr; return; } initialBlockPool = create_array(blockCount); if (initialBlockPool == nullptr) { initialBlockPoolSize = 0; } for (size_t i = 0; i < initialBlockPoolSize; ++i) { initialBlockPool[i].dynamicallyAllocated = false; } } inline Block* try_get_block_from_initial_pool() { if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { return nullptr; } auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; } inline void add_block_to_free_list(Block* block) { #ifdef MCDBGQ_TRACKMEM block->owner = nullptr; #endif if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { destroy(block); } else { freeList.add(block); } } inline void add_blocks_to_free_list(Block* block) { while (block != nullptr) { auto next = block->next; add_block_to_free_list(block); block = next; } } inline Block* try_get_block_from_free_list() { return freeList.try_get(); } // Gets a free block from one of the memory pools, or allocates a new one (if applicable) template Block* requisition_block() { auto block = try_get_block_from_initial_pool(); if (block != nullptr) { return block; } block = try_get_block_from_free_list(); if (block != nullptr) { return block; } MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { return create(); } else { return nullptr; } } #ifdef MCDBGQ_TRACKMEM public: struct MemStats { size_t allocatedBlocks; size_t usedBlocks; size_t freeBlocks; size_t ownedBlocksExplicit; size_t ownedBlocksImplicit; size_t implicitProducers; size_t explicitProducers; size_t elementsEnqueued; size_t blockClassBytes; size_t queueClassBytes; size_t implicitBlockIndexBytes; size_t explicitBlockIndexBytes; friend class ConcurrentQueue; private: static MemStats getFor(ConcurrentQueue* q) { MemStats stats = { 0 }; stats.elementsEnqueued = q->size_approx(); auto block = q->freeList.head_unsafe(); while (block != nullptr) { ++stats.allocatedBlocks; ++stats.freeBlocks; block = block->freeListNext.load(std::memory_order_relaxed); } for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { bool implicit = dynamic_cast(ptr) != nullptr; stats.implicitProducers += implicit ? 1 : 0; stats.explicitProducers += implicit ? 
0 : 1; if (implicit) { auto prod = static_cast(ptr); stats.queueClassBytes += sizeof(ImplicitProducer); auto head = prod->headIndex.load(std::memory_order_relaxed); auto tail = prod->tailIndex.load(std::memory_order_relaxed); auto hash = prod->blockIndex.load(std::memory_order_relaxed); if (hash != nullptr) { for (size_t i = 0; i != hash->capacity; ++i) { if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { ++stats.allocatedBlocks; ++stats.ownedBlocksImplicit; } } stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); for (; hash != nullptr; hash = hash->prev) { stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); } } for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { //auto block = prod->get_block_index_entry_for_index(head); ++stats.usedBlocks; } } else { auto prod = static_cast(ptr); stats.queueClassBytes += sizeof(ExplicitProducer); auto tailBlock = prod->tailBlock; bool wasNonEmpty = false; if (tailBlock != nullptr) { auto block = tailBlock; do { ++stats.allocatedBlocks; if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { ++stats.usedBlocks; wasNonEmpty = wasNonEmpty || block != tailBlock; } ++stats.ownedBlocksExplicit; block = block->next; } while (block != tailBlock); } auto index = prod->blockIndex.load(std::memory_order_relaxed); while (index != nullptr) { stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); index = static_cast(index->prev); } } } auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); stats.allocatedBlocks += freeOnInitialPool; stats.freeBlocks += freeOnInitialPool; stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; stats.queueClassBytes += sizeof(ConcurrentQueue); return stats; } }; // For debugging only. Not thread-safe. MemStats getMemStats() { return MemStats::getFor(this); } private: friend struct MemStats; #endif ////////////////////////////////// // Producer list manipulation ////////////////////////////////// ProducerBase* recycle_or_create_producer(bool isExplicit) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugLock lock(implicitProdMutex); #endif // Try to re-use one first for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { bool expected = true; if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { // We caught one! It's been marked as activated, the caller can have it return ptr; } } } return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); } ProducerBase* add_producer(ProducerBase* producer) { // Handle failed memory allocation if (producer == nullptr) { return nullptr; } producerCount.fetch_add(1, std::memory_order_relaxed); // Add it to the lock-free list auto prevTail = producerListTail.load(std::memory_order_relaxed); do { producer->next = prevTail; } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG if (producer->isExplicit) { auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); do { static_cast(producer)->nextExplicitProducer = prevTailExplicit; } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); } else { auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); do { static_cast(producer)->nextImplicitProducer = prevTailImplicit; } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); } #endif return producer; } void reown_producers() { // After another instance is moved-into/swapped-with this one, all the // producers we stole still think their parents are the other queue. // So fix them up! for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { ptr->parent = this; } } ////////////////////////////////// // Implicit producer hash ////////////////////////////////// struct ImplicitProducerKVP { std::atomic key; ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place ImplicitProducerKVP() : value(nullptr) { } ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT { key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); value = other.value; } inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT { swap(other); return *this; } inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT { if (this != &other) { details::swap_relaxed(key, other.key); std::swap(value, other.value); } } }; template friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; struct ImplicitProducerHash { size_t capacity; ImplicitProducerKVP* entries; ImplicitProducerHash* prev; }; inline void populate_initial_implicit_producer_hash() { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { return; } else { implicitProducerHashCount.store(0, std::memory_order_relaxed); auto hash = &initialImplicitProducerHash; hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; hash->entries = &initialImplicitProducerHashEntries[0]; for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); } hash->prev = nullptr; implicitProducerHash.store(hash, std::memory_order_relaxed); } } void swap_implicit_producer_hashes(ConcurrentQueue& other) { MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { return; } else { // Swap (assumes our implicit producer hash is initialized) initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; other.initialImplicitProducerHash.entries = 
&other.initialImplicitProducerHashEntries[0]; details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); } else { ImplicitProducerHash* hash; for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { continue; } hash->prev = &initialImplicitProducerHash; } if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); } else { ImplicitProducerHash* hash; for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { continue; } hash->prev = &other.initialImplicitProducerHash; } } } // Only fails (returns nullptr) if memory allocation fails ImplicitProducer* get_or_add_implicit_producer() { // Note that since the data is essentially thread-local (key is thread ID), // there's a reduced need for fences (memory ordering is already consistent // for any individual thread), except for the current table itself. // Start by looking for the thread ID in the current and all previous hash tables. // If it's not found, it must not be in there yet, since this same thread would // have added it previously to one of the tables that we traversed. // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugLock lock(implicitProdMutex); #endif auto id = details::thread_id(); auto hashedId = details::hash_thread_id(id); auto mainHash = implicitProducerHash.load(std::memory_order_acquire); assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { // Look for the id in this hash auto index = hashedId; while (true) { // Not an infinite loop because at least one slot is free in the hash table index &= hash->capacity - 1u; auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); if (probedKey == id) { // Found it! If we had to search several hashes deep, though, we should lazily add it // to the current main hash table to avoid the extended search next time. // Note there's guaranteed to be room in the current hash table since every subsequent // table implicitly reserves space for all previous tables (there's only one // implicitProducerHashCount). 
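// NOTE (illustrative sketch, not part of the original implementation): each hash table
// here is probed with plain linear probing over a power-of-two capacity, roughly:
//     size_t i = hashedId;
//     for (;;) {
//       i &= hash->capacity - 1;                         // wrap within the table
//       if (entries[i].key == id)                break;  // found our producer
//       if (entries[i].key == invalid_thread_id) break;  // not in this table, try hash->prev
//       ++i;                                             // probe the next slot
//     }
// Tables are grown well before they fill up, so a free slot always exists and the probe
// terminates; when a hit is found in an older table, the entry is lazily re-added to the
// newest table (just below) so later lookups stay short.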
auto value = hash->entries[index].value; if (hash != mainHash) { index = hashedId; while (true) { index &= mainHash->capacity - 1u; auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED auto reusable = details::invalid_thread_id2; if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #else if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { #endif mainHash->entries[index].value = value; break; } ++index; } } return value; } if (probedKey == details::invalid_thread_id) { break; // Not in this hash table } ++index; } } // Insert! auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); while (true) { // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { // We've acquired the resize lock, try to allocate a bigger hash table. // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when // we reload implicitProducerHash it must be the most recent version (it only gets changed within this // locked block). mainHash = implicitProducerHash.load(std::memory_order_acquire); if (newCount >= (mainHash->capacity >> 1)) { size_t newCapacity = mainHash->capacity << 1; while (newCount >= (newCapacity >> 1)) { newCapacity <<= 1; } auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); if (raw == nullptr) { // Allocation failed implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); return nullptr; } auto newHash = new (raw) ImplicitProducerHash; newHash->capacity = static_cast(newCapacity); newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); for (size_t i = 0; i != newCapacity; ++i) { new (newHash->entries + i) ImplicitProducerKVP; newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); } newHash->prev = mainHash; implicitProducerHash.store(newHash, std::memory_order_release); implicitProducerHashResizeInProgress.clear(std::memory_order_release); mainHash = newHash; } else { implicitProducerHashResizeInProgress.clear(std::memory_order_release); } } // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table // to finish being allocated by another thread (and if we just finished allocating above, the condition will // always be true) if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { auto producer = static_cast(recycle_or_create_producer(false)); if (producer == nullptr) { implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); return nullptr; } #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; producer->threadExitListener.userData = producer; details::ThreadExitNotifier::subscribe(&producer->threadExitListener); #endif auto index = hashedId; while (true) { index &= mainHash->capacity - 1u; auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED auto reusable = 
details::invalid_thread_id2; if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot mainHash->entries[index].value = producer; break; } #endif if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { mainHash->entries[index].value = producer; break; } ++index; } return producer; } // Hmm, the old hash is quite full and somebody else is busy allocating a new one. // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, // we try to allocate ourselves). mainHash = implicitProducerHash.load(std::memory_order_acquire); } } #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED void implicit_producer_thread_exited(ImplicitProducer* producer) { // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH debug::DebugLock lock(implicitProdMutex); #endif auto hash = implicitProducerHash.load(std::memory_order_acquire); assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place auto id = details::thread_id(); auto hashedId = details::hash_thread_id(id); details::thread_id_t probedKey; // We need to traverse all the hashes just in case other threads aren't on the current one yet and are // trying to add an entry thinking there's a free slot (because they reused a producer) for (; hash != nullptr; hash = hash->prev) { auto index = hashedId; do { index &= hash->capacity - 1u; probedKey = id; if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { break; } ++index; } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place } // Mark the queue as being recyclable producer->inactive.store(true, std::memory_order_release); } static void implicit_producer_thread_exited_callback(void* userData) { auto producer = static_cast(userData); auto queue = producer->parent; queue->implicit_producer_thread_exited(producer); } #endif ////////////////////////////////// // Utility functions ////////////////////////////////// template static inline void* aligned_malloc(size_t size) { MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) return (Traits::malloc)(size); else { size_t alignment = std::alignment_of::value; void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); if (!raw) return nullptr; char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); *(reinterpret_cast(ptr) - 1) = raw; return ptr; } } template static inline void aligned_free(void* ptr) { MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) return (Traits::free)(ptr); else (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); } template static inline U* create_array(size_t count) { assert(count > 0); U* p = static_cast(aligned_malloc(sizeof(U) * count)); if (p == nullptr) return nullptr; for (size_t i = 0; i != count; ++i) new (p + i) U(); return p; } template static inline void destroy_array(U* p, size_t count) { if (p != nullptr) { assert(count > 0); for (size_t i = count; i != 0; ) (p + --i)->~U(); } aligned_free(p); } template static inline U* create() { void* p = aligned_malloc(sizeof(U)); return p != nullptr ? 
new (p) U : nullptr; }
template<typename U, typename A1> static inline U* create(A1&& a1) { void* p = aligned_malloc<U>(sizeof(U)); return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr; }
template<typename U> static inline void destroy(U* p) { if (p != nullptr) p->~U(); aligned_free<U>(p); }
private:
std::atomic<ProducerBase*> producerListTail;
std::atomic<std::uint32_t> producerCount;
std::atomic<size_t> initialBlockPoolIndex;
Block* initialBlockPool;
size_t initialBlockPoolSize;
#ifndef MCDBGQ_USEDEBUGFREELIST
FreeList<Block> freeList;
#else
debug::DebugFreeList<Block> freeList;
#endif
std::atomic<ImplicitProducerHash*> implicitProducerHash;
std::atomic<size_t> implicitProducerHashCount; // Number of slots logically used
ImplicitProducerHash initialImplicitProducerHash;
std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
std::atomic_flag implicitProducerHashResizeInProgress;
std::atomic<std::uint32_t> nextExplicitConsumerId;
std::atomic<std::uint32_t> globalExplicitConsumerOffset;
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
debug::DebugMutex implicitProdMutex;
#endif
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
std::atomic<ExplicitProducer*> explicitProducers;
std::atomic<ImplicitProducer*> implicitProducers;
#endif
};
template<typename T, typename Traits> ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue) : producer(queue.recycle_or_create_producer(true)) { if (producer != nullptr) { producer->token = this; } }
template<typename T, typename Traits> ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue) : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true)) { if (producer != nullptr) { producer->token = this; } }
template<typename T, typename Traits> ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue) : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) { initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); lastKnownGlobalOffset = static_cast<std::uint32_t>(-1); }
template<typename T, typename Traits> ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue) : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) { initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); lastKnownGlobalOffset = static_cast<std::uint32_t>(-1); }
template<typename T, typename Traits> inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT { a.swap(b); }
inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT { a.swap(b); }
inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT { a.swap(b); }
template<typename T, typename Traits> inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT { a.swap(b); }
}
#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
#pragma warning(pop)
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic pop
#endif
================================================
FILE: dispenso/third-party/moodycamel/lightweightsemaphore.h
================================================
// Provides an efficient implementation of a semaphore (LightweightSemaphore).
// This is an extension of Jeff Preshing's semaphore implementation (licensed
// under the terms of its separate zlib license) that has been adapted and
// extended by Cameron Desrochers.
#pragma once
#include <cstddef> // For std::size_t
#include <atomic>
#include <type_traits> // For std::make_signed
#if defined(_WIN32)
// Avoid including windows.h in a header; we only need a handful of
// items, so we'll redeclare them here (this is relatively safe since
// the API generally has to remain stable between Windows versions).
// I know this is an ugly hack but it still beats polluting the global
// namespace with thousands of generic names or adding a .cpp for nothing.
extern "C" { struct _SECURITY_ATTRIBUTES; __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); __declspec(dllimport) int __stdcall CloseHandle(void* hObject); __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); } #elif defined(__MACH__) #include #elif defined(__unix__) #include #if defined(__GLIBC_PREREQ) && defined(_GNU_SOURCE) #if __GLIBC_PREREQ(2,30) #define MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC #endif #endif #endif namespace moodycamel { namespace details { // Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's // portable + lightweight semaphore implementations, originally from // https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h // LICENSE: // Copyright (c) 2015 Jeff Preshing // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
#if defined(_WIN32) class Semaphore { private: void* m_hSema; Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; public: Semaphore(int initialCount = 0) { assert(initialCount >= 0); const long maxLong = 0x7fffffff; m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); assert(m_hSema); } ~Semaphore() { CloseHandle(m_hSema); } bool wait() { const unsigned long infinite = 0xffffffff; return WaitForSingleObject(m_hSema, infinite) == 0; } bool try_wait() { return WaitForSingleObject(m_hSema, 0) == 0; } bool timed_wait(std::uint64_t usecs) { return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0; } void signal(int count = 1) { while (!ReleaseSemaphore(m_hSema, count, nullptr)); } }; #elif defined(__MACH__) //--------------------------------------------------------- // Semaphore (Apple iOS and OSX) // Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html //--------------------------------------------------------- class Semaphore { private: semaphore_t m_sema; Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; public: Semaphore(int initialCount = 0) { assert(initialCount >= 0); kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); assert(rc == KERN_SUCCESS); (void)rc; } ~Semaphore() { semaphore_destroy(mach_task_self(), m_sema); } bool wait() { return semaphore_wait(m_sema) == KERN_SUCCESS; } bool try_wait() { return timed_wait(0); } bool timed_wait(std::uint64_t timeout_usecs) { mach_timespec_t ts; ts.tv_sec = static_cast(timeout_usecs / 1000000); ts.tv_nsec = static_cast((timeout_usecs % 1000000) * 1000); // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html kern_return_t rc = semaphore_timedwait(m_sema, ts); return rc == KERN_SUCCESS; } void signal() { while (semaphore_signal(m_sema) != KERN_SUCCESS); } void signal(int count) { while (count-- > 0) { while (semaphore_signal(m_sema) != KERN_SUCCESS); } } }; #elif defined(__unix__) //--------------------------------------------------------- // Semaphore (POSIX, Linux) //--------------------------------------------------------- class Semaphore { private: sem_t m_sema; Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; public: Semaphore(int initialCount = 0) { assert(initialCount >= 0); int rc = sem_init(&m_sema, 0, static_cast(initialCount)); assert(rc == 0); (void)rc; } ~Semaphore() { sem_destroy(&m_sema); } bool wait() { // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error int rc; do { rc = sem_wait(&m_sema); } while (rc == -1 && errno == EINTR); return rc == 0; } bool try_wait() { int rc; do { rc = sem_trywait(&m_sema); } while (rc == -1 && errno == EINTR); return rc == 0; } bool timed_wait(std::uint64_t usecs) { struct timespec ts; const int usecs_in_1_sec = 1000000; const int nsecs_in_1_sec = 1000000000; #ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC clock_gettime(CLOCK_MONOTONIC, &ts); #else clock_gettime(CLOCK_REALTIME, &ts); #endif ts.tv_sec += (time_t)(usecs / usecs_in_1_sec); ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000; // sem_timedwait bombs if you have more than 1e9 in tv_nsec // so we have to clean things up before passing 
it in if (ts.tv_nsec >= nsecs_in_1_sec) { ts.tv_nsec -= nsecs_in_1_sec; ++ts.tv_sec; } int rc; do { #ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC rc = sem_clockwait(&m_sema, CLOCK_MONOTONIC, &ts); #else rc = sem_timedwait(&m_sema, &ts); #endif } while (rc == -1 && errno == EINTR); return rc == 0; } void signal() { while (sem_post(&m_sema) == -1); } void signal(int count) { while (count-- > 0) { while (sem_post(&m_sema) == -1); } } }; #else #error Unsupported platform! (No semaphore wrapper available) #endif } // end namespace details //--------------------------------------------------------- // LightweightSemaphore //--------------------------------------------------------- class LightweightSemaphore { public: typedef std::make_signed::type ssize_t; private: std::atomic m_count; details::Semaphore m_sema; int m_maxSpins; bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) { ssize_t oldCount; int spin = m_maxSpins; while (--spin >= 0) { oldCount = m_count.load(std::memory_order_relaxed); if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) return true; std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. } oldCount = m_count.fetch_sub(1, std::memory_order_acquire); if (oldCount > 0) return true; if (timeout_usecs < 0) { if (m_sema.wait()) return true; } if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs)) return true; // At this point, we've timed out waiting for the semaphore, but the // count is still decremented indicating we may still be waiting on // it. So we have to re-adjust the count, but only if the semaphore // wasn't signaled enough times for us too since then. If it was, we // need to release the semaphore too. while (true) { oldCount = m_count.load(std::memory_order_acquire); if (oldCount >= 0 && m_sema.try_wait()) return true; if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) return false; } } ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) { assert(max > 0); ssize_t oldCount; int spin = m_maxSpins; while (--spin >= 0) { oldCount = m_count.load(std::memory_order_relaxed); if (oldCount > 0) { ssize_t newCount = oldCount > max ? 
oldCount - max : 0; if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) return oldCount - newCount; } std::atomic_signal_fence(std::memory_order_acquire); } oldCount = m_count.fetch_sub(1, std::memory_order_acquire); if (oldCount <= 0) { if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs))) { while (true) { oldCount = m_count.load(std::memory_order_acquire); if (oldCount >= 0 && m_sema.try_wait()) break; if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) return 0; } } } if (max > 1) return 1 + tryWaitMany(max - 1); return 1; } public: LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins) { assert(initialCount >= 0); assert(maxSpins >= 0); } bool tryWait() { ssize_t oldCount = m_count.load(std::memory_order_relaxed); while (oldCount > 0) { if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) return true; } return false; } bool wait() { return tryWait() || waitWithPartialSpinning(); } bool wait(std::int64_t timeout_usecs) { return tryWait() || waitWithPartialSpinning(timeout_usecs); } // Acquires between 0 and (greedily) max, inclusive ssize_t tryWaitMany(ssize_t max) { assert(max >= 0); ssize_t oldCount = m_count.load(std::memory_order_relaxed); while (oldCount > 0) { ssize_t newCount = oldCount > max ? oldCount - max : 0; if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) return oldCount - newCount; } return 0; } // Acquires at least one, and (greedily) at most max ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) { assert(max >= 0); ssize_t result = tryWaitMany(max); if (result == 0 && max > 0) result = waitManyWithPartialSpinning(max, timeout_usecs); return result; } ssize_t waitMany(ssize_t max) { ssize_t result = waitMany(max, -1); assert(result > 0); return result; } void signal(ssize_t count = 1) { assert(count >= 0); ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); ssize_t toRelease = -oldCount < count ? -oldCount : count; if (toRelease > 0) { m_sema.signal((int)toRelease); } } std::size_t availableApprox() const { ssize_t count = m_count.load(std::memory_order_relaxed); return count > 0 ? static_cast(count) : 0; } }; } // end namespace moodycamel ================================================ FILE: dispenso/thread_id.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include namespace dispenso { std::atomic nextThread{0}; constexpr uint64_t kInvalidThread = std::numeric_limits::max(); DISPENSO_THREAD_LOCAL uint64_t currentThread = kInvalidThread; uint64_t threadId() { if (currentThread == kInvalidThread) { currentThread = nextThread.fetch_add(uint64_t{1}, std::memory_order_relaxed); } return currentThread; } } // namespace dispenso ================================================ FILE: dispenso/thread_id.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ /** * @file thread_id.h * @ingroup group_util * Utilities for getting a unique thread identifier. **/ #pragma once #include namespace dispenso { /** * Get the current thread's identifier, unique within the current process. * * @return An integer representing the current thread. * * @note Thread IDs are assumed to not be reused over the lifetime of a process, but this should * still enable processes running for thousands of years, even with very poor spawn/kill thread * patterns. * * @note If thread ID is needed for cross-process synchronization, one must fall back on * system-specific thread IDs. **/ DISPENSO_DLL_ACCESS uint64_t threadId(); } // namespace dispenso ================================================ FILE: dispenso/thread_pool.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #include #if defined DISPENSO_DEBUG #include #endif // DISPENSO_DEBUG namespace dispenso { namespace { size_t getAdjustedThreadCount(size_t requested) { static const size_t maxThreads = []() { size_t maxT = std::numeric_limits::max(); #if defined(_WIN32) && !defined(__MINGW32__) #pragma warning(push) #pragma warning(disable : 4996) #endif char* envThreads = std::getenv("DISPENSO_MAX_THREADS_PER_POOL"); #if defined(_WIN32) && !defined(__MINGW32__) #pragma warning(pop) #endif if (envThreads) { char* end = nullptr; maxT = std::strtoul(envThreads, &end, 10); #if defined DISPENSO_DEBUG std::cout << "DISPENSO_MAX_THREADS_PER_POOL = " << maxT << std::endl; #endif // DISPENSO_DEBUG } return maxT; }(); return std::min(requested, maxThreads); } } // namespace void ThreadPool::PerThreadData::setThread(std::thread&& t) { thread_ = std::move(t); } void ThreadPool::PerThreadData::stop() { running_.store(false, std::memory_order_release); } uint32_t ThreadPool::wait(uint32_t currentEpoch) { if (sleepLengthUs_ > 0) { return epochWaiter_.waitFor(currentEpoch, sleepLengthUs_.load(std::memory_order_acquire)); } else { return epochWaiter_.current(); } } void ThreadPool::wake() { epochWaiter_.bumpAndWake(); } inline bool ThreadPool::PerThreadData::running() { return running_.load(std::memory_order_acquire); } ThreadPool::ThreadPool(size_t n, size_t poolLoadMultiplier) : poolLoadMultiplier_(poolLoadMultiplier), poolLoadFactor_(static_cast(getAdjustedThreadCount(n) * poolLoadMultiplier)), numThreads_(static_cast(getAdjustedThreadCount(n))) { detail::registerFineSchedulerQuanta(); #if defined DISPENSO_DEBUG assert(poolLoadMultiplier > 0); #endif // DISPENSO_DEBUG for (size_t i = 0; i < static_cast(numThreads_); ++i) { threads_.emplace_back(); threads_.back().setThread(std::thread([this, &back = threads_.back()]() { threadLoop(back); })); } } ThreadPool::PerThreadData::~PerThreadData() {} void ThreadPool::threadLoop(PerThreadData& data) { // On Windows, wake syscalls (WakeByAddressSingle) are expensive per-thread // kernel transitions, so spin longer before sleeping to avoid costly // sleep/wake cycles. 
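  // Backoff progression used below (descriptive summary): each pass that dequeues nothing
  // bumps failCount and issues cpuRelax(); past kBackoffYield failures the thread also yields
  // its timeslice, and past kBackoffSleep failures it typically parks on epochWaiter_
  // (bounded by sleepLengthUs_) until the epoch is bumped or the wait times out.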
#if defined(_WIN32) static constexpr int kBackoffYield = 100; static constexpr int kBackoffSleep = kBackoffYield + 20; #else static constexpr int kBackoffYield = 50; static constexpr int kBackoffSleep = kBackoffYield + 5; #endif moodycamel::ConsumerToken ctoken(work_); moodycamel::ProducerToken ptoken(work_); OnceFunction next; int failCount = 0; detail::PerPoolPerThreadInfo::registerPool(this, &ptoken); uint32_t epoch = epochWaiter_.current(); if (enableEpochWaiter_) { while (data.running()) { int localWorkDone = 0; while (true) { DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool got = work_.try_dequeue(ctoken, next); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (!got) { break; } OnceFunction task(std::move(next)); task(); ++localWorkDone; if (localWorkDone >= kWorkBatchSize) { workRemaining_.fetch_sub(localWorkDone, std::memory_order_relaxed); localWorkDone = 0; } failCount = 0; } if (localWorkDone > 0) { workRemaining_.fetch_sub(localWorkDone, std::memory_order_relaxed); } ++failCount; detail::cpuRelax(); if (failCount > kBackoffSleep) { numSleeping_.fetch_add(1, std::memory_order_acq_rel); epoch = wait(epoch); numSleeping_.fetch_sub(1, std::memory_order_acq_rel); failCount = 0; } else if (failCount > kBackoffYield) { std::this_thread::yield(); } } } else { while (data.running()) { while (true) { DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool got = work_.try_dequeue(ctoken, next); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (!got) { break; } executeNext(std::move(next)); failCount = 0; } ++failCount; detail::cpuRelax(); if (failCount > kBackoffSleep) { epoch = wait(epoch); } else if (failCount > kBackoffYield) { std::this_thread::yield(); } } } } void ThreadPool::resizeLocked(ssize_t sn) { sn = getAdjustedThreadCount(sn); assert(sn >= 0); size_t n = static_cast(sn); if (n < threads_.size()) { for (size_t i = n; i < threads_.size(); ++i) { threads_[i].stop(); } while (threads_.size() > n) { wake(); threads_.back().thread_.join(); threads_.pop_back(); } } else if (n > threads_.size()) { for (size_t i = threads_.size(); i < n; ++i) { threads_.emplace_back(); threads_.back().setThread( std::thread([this, &back = threads_.back()]() { threadLoop(back); })); } } poolLoadFactor_.store(static_cast(n * poolLoadMultiplier_), std::memory_order_relaxed); numThreads_.store(sn, std::memory_order_relaxed); if (!sn) { // Pool will run future tasks inline since we have no threads, but we still need to empty // current set of tasks while (tryExecuteNext()) { } } } ThreadPool::~ThreadPool() { #if defined DISPENSO_DEBUG assert(outstandingTaskSets_.load(std::memory_order_acquire) == 0); #endif // DISPENSO_DEBUG // Strictly speaking, it is unnecessary to lock this in the destructor; however, it could be a // useful diagnostic to learn that the mutex is already locked when we reach this point. std::unique_lock lk(threadsMutex_, std::try_to_lock); assert(lk.owns_lock()); // Mark all threads as stopped first, then wake them all at once. The previous // approach (stop + wake one at a time) was fragile: a wake() could reach an // already-awake thread while another thread remained sleeping, forcing it to // wait for the epoch timeout (e.g. 30ms) to notice the stop flag. Under TSAN // this caused severe slowdowns in ThreadPool destruction. 
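  // Shutdown sequence below, in order: flag every worker as stopped, issue one batched wake
  // for all of them, drain any still-queued work on this thread, join the workers, then drain
  // once more in case a worker enqueued follow-up tasks before exiting.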
for (auto& t : threads_) { t.stop(); } ssize_t numThreads = static_cast(threads_.size()); if (numThreads > 0) { wakeN(numThreads); } while (tryExecuteNext()) { } for (auto& t : threads_) { t.thread_.join(); } threads_.clear(); while (tryExecuteNext()) { } } ThreadPool& globalThreadPool() { // It should be illegal to access globalThreadPool after exiting main. // We default to hardware threads minus one because the calling thread usually is involved in // computation. static ThreadPool pool(std::thread::hardware_concurrency() - 1); return pool; } void resizeGlobalThreadPool(size_t numThreads) { globalThreadPool().resize(static_cast(numThreads)); } void ThreadPool::wakeN(ssize_t n) { #if defined(_WIN32) // On Windows, always use WakeAll. WakeByAddressSingle has no batch wake // count, so multiple calls means O(N) kernel transitions. A single WakeAll // is one transition and keeps threads in their spin phase (kBackoffYield // iterations) where they can absorb follow-up work without re-waking. (void)n; epochWaiter_.bumpAndWakeAll(); #else ssize_t sleeping = numSleeping_.load(std::memory_order_relaxed); constexpr unsigned kWakeAllMultiplier = 2; if (static_cast(n) * kWakeAllMultiplier >= static_cast(sleeping)) { epochWaiter_.bumpAndWakeAll(); } else { epochWaiter_.bumpAndWakeN(static_cast(n), static_cast(sleeping)); } #endif } } // namespace dispenso ================================================ FILE: dispenso/thread_pool.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file thread_pool.h * @ingroup group_core * A file providing ThreadPool. This is the heart of dispenso. All other scheduling paradigms, * including TaskSets, Futures, pipelines, and parallel loops, are built on top of ThreadPool. **/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace dispenso { #if !defined(DISPENSO_WAKEUP_ENABLE) #if defined(_WIN32) || defined(__linux__) || defined(__MACH__) #define DISPENSO_WAKEUP_ENABLE 1 #else #define DISPENSO_WAKEUP_ENABLE 0 #endif // platform #endif // DISPENSO_WAKEUP_ENABLE #if !defined(DISPENSO_POLL_PERIOD_US) #if defined(_WIN32) #define DISPENSO_POLL_PERIOD_US 1000 #else #if !(DISPENSO_WAKEUP_ENABLE) #define DISPENSO_POLL_PERIOD_US 200 #else #define DISPENSO_POLL_PERIOD_US (1 << 15) // Determined empirically good on dual Xeon Linux #endif // DISPENSO_WAKEUP_ENABLE #endif // PLATFORM #endif // DISPENSO_POLL_PERIOD_US constexpr uint32_t kDefaultSleepLenUs = DISPENSO_POLL_PERIOD_US; constexpr bool kDefaultWakeupEnable = DISPENSO_WAKEUP_ENABLE; /** * A simple tag specifier that can be fed to TaskSets and * ThreadPools to denote that the current thread should never immediately execute a functor, but * rather, the functor should always be placed in the ThreadPool's queue. **/ struct ForceQueuingTag {}; /** * The basic executor for dispenso. It provides typical thread pool functionality, plus allows work * stealing by related types (e.g. TaskSet, Future, etc...), which prevents deadlock when waiting * for pool-recursive tasks. */ class alignas(kCacheLineSize) ThreadPool { public: /** * Construct a thread pool. * * @param n The number of threads to spawn at construction. 
* @param poolLoadMultiplier A parameter that specifies how overloaded the pool should be before * allowing the current thread to self-steal work. **/ DISPENSO_DLL_ACCESS ThreadPool(size_t n, size_t poolLoadMultiplier = 32); /** * Enable or disable signaling wake functionality. If enabled, this will try to ensure that * threads are woken up proactively when work has not been available and it becomes available. * This function is blocking and potentially very slow. Repeated use is discouraged. * * @param enable If set true, turns on signaling wake. If false, turns it off. * @param sleepDuration If enable is true, this is the length of time a thread will wait for a * signal before waking up. If enable is false, this is the length of time a thread will sleep * between polling. * * @note It is highly recommended to leave signaling wake enabled on Windows platforms, as * sleeping/polling tends to perform poorly for intermittent workloads. For Mac/Linux platforms, * it is okay to enable signaling wake, particularly if you wish to set a longer expected duration * between work. If signaling wake is disabled, ensure sleepDuration is small (e.g. 200us) for * best performance. Most users will not need to call this function, as defaults are reasonable. * * **/ template void setSignalingWake( bool enable, const std::chrono::duration& sleepDuration = std::chrono::microseconds(kDefaultSleepLenUs)) { setSignalingWake( enable, static_cast( std::chrono::duration_cast(sleepDuration).count())); } /** * Change the number of threads backing the thread pool. This is a blocking and potentially * slow operation, and repeatedly resizing is discouraged. * * @param n The number of threads in use after call completion **/ DISPENSO_DLL_ACCESS void resize(ssize_t n) { std::lock_guard lk(threadsMutex_); resizeLocked(n); } /** * Get the number of threads backing the pool. If called concurrently to resize, the * number returned may be stale. * * @return The current number of threads backing the pool. **/ ssize_t numThreads() const { return numThreads_.load(std::memory_order_relaxed); } /** * Schedule a functor to be executed. If the pool's load factor is high, execution may happen * inline by the calling thread. * * @param f The functor to be executed. f's signature must match void(). Best * performance will come from passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f); /** * Schedule a functor to be executed. The functor will always be queued and executed by pool * threads. * * @param f The functor to be executed. f's signature must match void(). Best * performance will come from passing lambdas, other concrete functors, or OnceFunction, but * std::function or similarly type-erased objects will also work. **/ template DISPENSO_REQUIRES(OnceCallableFunc) void schedule(F&& f, ForceQueuingTag); /** * Schedule multiple functors to be executed in bulk. This is more efficient than calling * schedule() multiple times when you have many tasks to submit, as it reduces atomic contention * and allows for better thread wakeup behavior. * * @param count The number of functors to schedule. * @param gen A generator functor that takes an index and returns a functor to execute. * gen(i) will be called for i in [0, count) to produce each task. 
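   *
   * A minimal usage sketch (the names here are illustrative, not part of the API):
   * @code
   *   // Submit `count` independent tasks in one call; each gen(i) returns a void() functor.
   *   pool.scheduleBulk(count, [&results](size_t i) {
   *     return [&results, i]() { results[i] = computeItem(i); };
   *   });
   * @endcode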
* * @note Work is enqueued in chunks and interleaved with inline execution based on the pool's * load factor, preventing both pool thread starvation and excessive queueing overhead. **/ template void scheduleBulk(size_t count, Generator&& gen); /** * Destruct the pool. This destructor is blocking until all queued work is completed. It is * illegal to call the destructor while any other thread makes calls to the pool (as is generally * the case with C++ classes). **/ DISPENSO_DLL_ACCESS ~ThreadPool(); private: class PerThreadData { public: void setThread(std::thread&& t); bool running(); void stop(); ~PerThreadData(); public: alignas(kCacheLineSize) std::thread thread_; std::atomic running_{true}; }; DISPENSO_DLL_ACCESS uint32_t wait(uint32_t priorEpoch); DISPENSO_DLL_ACCESS void wake(); DISPENSO_DLL_ACCESS void wakeN(ssize_t n); void setSignalingWake(bool enable, uint32_t sleepDurationUs) { std::lock_guard lk(threadsMutex_); ssize_t currentPoolSize = numThreads(); resizeLocked(0); enableEpochWaiter_.store(enable, std::memory_order_release); sleepLengthUs_.store(sleepDurationUs, std::memory_order_release); resizeLocked(currentPoolSize); } DISPENSO_DLL_ACCESS void resizeLocked(ssize_t n); void executeNext(OnceFunction work); DISPENSO_DLL_ACCESS void threadLoop(PerThreadData& threadData); bool tryExecuteNext(); bool tryExecuteNextFromProducerToken(moodycamel::ProducerToken& token); template void schedule(moodycamel::ProducerToken& token, F&& f); template void schedule(moodycamel::ProducerToken& token, F&& f, ForceQueuingTag); // Core bulk enqueue: unconditionally stage, enqueue, and wake for a chunk of tasks. // Caller is responsible for load factor checks. Count should be small (e.g. <= 2*numThreads). // When a producer token is provided, uses token-based enqueue for better throughput. template void scheduleBulkEnqueue(size_t count, Generator&& gen, moodycamel::ProducerToken* token = nullptr); // The non-atomic read of two separate atomics (numSleeping_ and workRemaining_) can // race, potentially causing a missed wake in rare cases. This is benign: the epoch // waiter's sleep timeout (sleepLengthUs_) provides a safety net, so a missed wake only // delays wakeup by up to that duration, never causes a hang. Subsequent schedule() // calls will trigger wakes. void conditionallyWake() { if (enableEpochWaiter_.load(std::memory_order_acquire)) { ssize_t sleeping = numSleeping_.load(std::memory_order_acquire); if (sleeping > 0) { // Only wake if there's more pending work than awake threads can handle. // Don't count the caller — it may be a pool thread that will continue // dequeuing its own work, not processing what it just submitted. ssize_t awake = numThreads_.load(std::memory_order_relaxed) - sleeping; ssize_t pending = workRemaining_.load(std::memory_order_relaxed); if (pending > awake) { wake(); } } } } public: // If we are not yet C++17, we provide aligned new/delete to avoid false sharing. #if __cplusplus < 201703L static void* operator new(size_t sz) { return detail::alignedMalloc(sz); } static void operator delete(void* ptr) { return detail::alignedFree(ptr); } #endif // __cplusplus private: // Number of tasks each thread accumulates before flushing workRemaining_. // This batching reduces atomic contention in threadLoop, but inflates // workRemaining_ by up to kWorkBatchSize * numThreads, so poolLoadFactor_ // must be reduced accordingly for accurate load-shedding in schedule(). 
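  // For example (illustrative arithmetic): with 16 worker threads and the batch size of 8
  // below, workRemaining_ can overstate the truly pending work by up to 8 * 16 = 128 tasks
  // between flushes.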
static constexpr int kWorkBatchSize = 8; mutable std::mutex threadsMutex_; std::deque threads_; size_t poolLoadMultiplier_; // These atomics are read frequently in the hot schedule() path, so they need // cache-line alignment to avoid false sharing with the mutex/deque above. alignas(kCacheLineSize) std::atomic poolLoadFactor_; std::atomic numThreads_; moodycamel::ConcurrentQueue work_; alignas(kCacheLineSize) std::atomic numSleeping_{0}; alignas(kCacheLineSize) std::atomic workRemaining_{0}; alignas(kCacheLineSize) detail::EpochWaiter epochWaiter_; alignas(kCacheLineSize) std::atomic enableEpochWaiter_{kDefaultWakeupEnable}; std::atomic sleepLengthUs_{kDefaultSleepLenUs}; #if defined DISPENSO_DEBUG alignas(kCacheLineSize) std::atomic outstandingTaskSets_{0}; #endif // NDEBUG friend class ConcurrentTaskSet; friend class TaskSet; friend class TaskSetBase; }; /** * Get access to the global thread pool. * * @return the global thread pool **/ DISPENSO_DLL_ACCESS ThreadPool& globalThreadPool(); /** * Change the number of threads backing the global thread pool. * * @param numThreads The number of threads to back the global thread pool. **/ DISPENSO_DLL_ACCESS void resizeGlobalThreadPool(size_t numThreads); // ----------------------------- Implementation details ------------------------------------- template DISPENSO_REQUIRES(OnceCallableFunc) inline void ThreadPool::schedule(F&& f) { ssize_t curWork = workRemaining_.load(std::memory_order_relaxed); ssize_t quickLoadFactor = numThreads_.load(std::memory_order_relaxed); quickLoadFactor += quickLoadFactor / 2; if ((detail::PerPoolPerThreadInfo::isPoolRecursive(this) && curWork > quickLoadFactor) || (curWork > poolLoadFactor_.load(std::memory_order_relaxed))) { f(); } else { schedule(std::forward(f), ForceQueuingTag()); } } template DISPENSO_REQUIRES(OnceCallableFunc) inline void ThreadPool::schedule(F&& f, ForceQueuingTag) { if (auto* token = static_cast(detail::PerPoolPerThreadInfo::producer(this))) { schedule(*token, std::forward(f), ForceQueuingTag()); return; } if (!numThreads_.load(std::memory_order_relaxed)) { f(); return; } workRemaining_.fetch_add(1, std::memory_order_release); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool enqueued = work_.enqueue({std::forward(f)}); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); (void)(enqueued); // unused assert(enqueued); conditionallyWake(); } template inline void ThreadPool::schedule(moodycamel::ProducerToken& token, F&& f) { ssize_t curWork = workRemaining_.load(std::memory_order_relaxed); ssize_t quickLoadFactor = numThreads_.load(std::memory_order_relaxed); quickLoadFactor += quickLoadFactor / 2; if ((detail::PerPoolPerThreadInfo::isPoolRecursive(this) && curWork > quickLoadFactor) || (curWork > poolLoadFactor_.load(std::memory_order_relaxed))) { f(); } else { schedule(token, std::forward(f), ForceQueuingTag()); } } template inline void ThreadPool::schedule(moodycamel::ProducerToken& token, F&& f, ForceQueuingTag) { if (!numThreads_.load(std::memory_order_relaxed)) { f(); return; } workRemaining_.fetch_add(1, std::memory_order_release); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool enqueued = work_.enqueue(token, {std::forward(f)}); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); (void)(enqueued); // unused assert(enqueued); conditionallyWake(); } inline bool ThreadPool::tryExecuteNext() { OnceFunction next; DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool dequeued = work_.try_dequeue(next); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); if (dequeued) { executeNext(std::move(next)); return true; } 
return false; } inline bool ThreadPool::tryExecuteNextFromProducerToken(moodycamel::ProducerToken& token) { OnceFunction next; if (work_.try_dequeue_from_producer(token, next)) { executeNext(std::move(next)); return true; } return false; } inline void ThreadPool::executeNext(OnceFunction next) { next(); workRemaining_.fetch_add(-1, std::memory_order_relaxed); } namespace detail { // Generating iterator for scheduleBulkEnqueue. Produces OnceFunction objects // on-the-fly during enqueue_bulk, avoiding the need for a staging buffer. // moodycamel's enqueue_bulk uses single-pass input iterator semantics. template struct BulkGenIter { using difference_type = std::ptrdiff_t; using value_type = OnceFunction; using pointer = OnceFunction*; using reference = OnceFunction&; using iterator_category = std::input_iterator_tag; Generator* gen; size_t index; OnceFunction operator*() { return (*gen)(index); } BulkGenIter& operator++() { ++index; return *this; } BulkGenIter operator++(int) { BulkGenIter tmp = *this; ++index; return tmp; } }; } // namespace detail template void ThreadPool::scheduleBulkEnqueue( size_t count, Generator&& gen, moodycamel::ProducerToken* token) { detail::BulkGenIter::type> it{&gen, 0}; // Single atomic update + bulk enqueue workRemaining_.fetch_add(static_cast(count), std::memory_order_release); DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN(); bool enqueued; if (token) { enqueued = work_.enqueue_bulk(*token, it, count); } else { enqueued = work_.enqueue_bulk(it, count); } DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END(); (void)(enqueued); assert(enqueued); // Wake appropriate threads if (enableEpochWaiter_.load(std::memory_order_acquire)) { ssize_t sleeping = numSleeping_.load(std::memory_order_acquire); ssize_t toWake = std::min(sleeping, static_cast(count)); if (toWake > 0) { wakeN(toWake); } } } template void ThreadPool::scheduleBulk(size_t count, Generator&& gen) { if (count == 0) { return; } ssize_t numPool = numThreads_.load(std::memory_order_relaxed); if (!numPool) { // No threads in pool - execute all inline for (size_t i = 0; i < count; ++i) { gen(i)(); } return; } // Process in chunks, interleaving enqueue and inline execution based on load. size_t chunkSize = static_cast(numPool) + static_cast(numPool) / 2; if (chunkSize < 1) { chunkSize = 1; } size_t i = 0; while (i < count) { ssize_t curWork = workRemaining_.load(std::memory_order_relaxed); ssize_t loadFactor = poolLoadFactor_.load(std::memory_order_relaxed); if (curWork > loadFactor) { // Over load factor - execute one task inline, then re-check gen(i)(); ++i; } else { // Under load factor - enqueue a chunk ssize_t room = loadFactor - curWork; size_t toEnqueue = std::min({count - i, chunkSize, static_cast(room)}); if (toEnqueue == 0) { toEnqueue = 1; } size_t base = i; scheduleBulkEnqueue(toEnqueue, [&gen, base](size_t j) { return gen(base + j); }); i += toEnqueue; } } } } // namespace dispenso ================================================ FILE: dispenso/timed_task.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace dispenso { TimedTaskScheduler::TimedTaskScheduler(ThreadPriority prio) : priority_(prio) { thread_ = std::thread([this, prio]() { detail::registerFineSchedulerQuanta(); if (!setCurrentThreadPriority(prio)) { std::cerr << "Couldn't set thread priority" << std::endl; } timeQueueRunLoop(); }); } TimedTaskScheduler::~TimedTaskScheduler() { { std::lock_guard lk(queueMutex_); running_ = false; } epoch_.bumpAndWake(); thread_.join(); } void TimedTaskScheduler::kickOffTask(std::shared_ptr next, double curTime) { size_t remaining = next->timesToRun.fetch_sub(1, std::memory_order_acq_rel); if (remaining == 1) { auto* np = next.get(); np->func(std::move(next)); } else if (remaining > 1) { next->func(next); if (next->steady) { next->nextAbsTime += next->period; } else { next->nextAbsTime = curTime + next->period; } std::lock_guard lk(queueMutex_); tasks_.push(std::move(next)); } } constexpr double kSmallTimeBuffer = 10e-6; void TimedTaskScheduler::timeQueueRunLoop() { #if defined(_WIN32) constexpr double kSpinYieldBuffer = 1e-3; constexpr double kSpinBuffer = 100e-6; #else constexpr double kSpinYieldBuffer = 500e-6; constexpr double kSpinBuffer = 50e-6; #endif // platform constexpr double kConvertToUs = 1e6; uint32_t curEpoch = epoch_.current(); while (true) { { std::unique_lock lk(queueMutex_); if (priority_ != getCurrentThreadPriority()) { setCurrentThreadPriority(priority_); } if (!running_) { break; } if (tasks_.empty()) { lk.unlock(); curEpoch = epoch_.wait(curEpoch); continue; } } double curTime = getTime(); double timeRemaining; std::unique_lock lk(queueMutex_); timeRemaining = tasks_.top()->nextAbsTime - curTime; if (timeRemaining < kSmallTimeBuffer) { auto next = tasks_.top(); tasks_.pop(); lk.unlock(); kickOffTask(std::move(next), curTime); } else if (timeRemaining < kSpinBuffer) { continue; } else if (timeRemaining < kSpinYieldBuffer) { lk.unlock(); std::this_thread::yield(); continue; } else { lk.unlock(); curEpoch = epoch_.waitFor( curEpoch, static_cast((timeRemaining - kSpinBuffer) * kConvertToUs)); } } } void TimedTaskScheduler::addTimedTask(std::shared_ptr task) { double curTime = getTime(); double timeRemaining; timeRemaining = task->nextAbsTime - curTime; if (timeRemaining < kSmallTimeBuffer) { kickOffTask(std::move(task), curTime); } else { std::lock_guard lk(queueMutex_); tasks_.push(std::move(task)); } epoch_.bumpAndWake(); } TimedTaskScheduler& globalTimedTaskScheduler() { static TimedTaskScheduler scheduler; return scheduler; } } // namespace dispenso ================================================ FILE: dispenso/timed_task.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file timed_task.h * @ingroup group_async * Utilities for delaying a task until a future time and periodic scheduling. **/ #pragma once #include #include #include #include #include #include #include namespace dispenso { /** * For periodic tasks, this type will describe the behavior of how tasks are scheduled **/ enum class TimedTaskType { kNormal ///< Schedule the next task at period plus the time the first or most recent task ran , kSteady ///< Schedule the next task at period plus the time the first or most recent task was ///< scheduled for }; /** * A timed task type. 
This encapsulates a function that will run at a time scheduled in the future, * and optionally periodically for a specified number of times. **/ class TimedTask { public: TimedTask(const TimedTask&) = delete; /** Move constructor. */ TimedTask(TimedTask&& other) : impl_(std::move(other.impl_)) {} TimedTask& operator=(const TimedTask&) = delete; TimedTask& operator=(TimedTask&& other) { impl_ = std::move(other.impl_); return *this; } /** * Cancel the task. No further runs to the underlying function will occur (though any calls in * progress will complete). **/ void cancel() { impl_->timesToRun.store(0, std::memory_order_release); impl_->flags.fetch_or(detail::kFFlagsCancelled, std::memory_order_release); } /** * Detach this task so that it cannot block in destruction, and destruction does not cancel * further runs to the underlying function. This is only to be used in cases where the underlying * schedulable and any function resources are expected to outlive any other copy of the task. **/ void detach() { impl_->flags.fetch_or(detail::kFFlagsDetached, std::memory_order_release); } /** * See how many calls there have been to the underlying function have completed thus far. * * @return the count of calls made this far to the underlying function. **/ size_t calls() const { return impl_->count.load(std::memory_order_acquire); } /** * Destroy the timed task. If detach() has not been called, this will cancel further calls to the * underlying scheduled function, and will wait if necessary until the function is no longer in * progress. This is to naturally protect resources that may be used by the function. If * detach() was called, the destructor will not block. * **/ ~TimedTask() { if (!impl_ || impl_->flags.load(std::memory_order_acquire) & detail::kFFlagsDetached) { return; } cancel(); while (impl_->inProgress.load(std::memory_order_acquire)) { } // Now we can safely destroy the underlying function. We do this here because we can't risk // that func may call code in it's destructor that may no longer be relevant after this // TimedTask destructor completes. impl_->func = {}; } private: template TimedTask( Schedulable& sched, F&& f, double nextRunAbs, double period = 0.0, size_t timesToRun = 1, TimedTaskType type = TimedTaskType::kNormal) : impl_( detail::make_shared( timesToRun, nextRunAbs, period, std::forward(f), sched, type == TimedTaskType::kSteady)) {} std::shared_ptr impl_; friend class TimedTaskScheduler; }; // namespace dispenso /** * A timed-task scheduler running on a single thread. This allows multiple schedulers if necessary, * if e.g. you wish to schedule against InlineInvoker backing schedulable and you're running * a lot of relatively quick timed tasks.Most people should just use the global timed task * scheduler. **/ class TimedTaskScheduler { public: /** * Create a TimedTaskScheduler with specified priority * * @param priority The priority to set. For highest level of periodicity accuracy, use kRealtime. * @note Priorities above kNormal should be used sparingly, and only short-running tasks should be * run inline to avoid bad OS responsivity. When using a ThreadPool Schedulable, tasks can be * long running. **/ DISPENSO_DLL_ACCESS explicit TimedTaskScheduler( ThreadPriority priority = ThreadPriority::kNormal); DISPENSO_DLL_ACCESS ~TimedTaskScheduler(); /** * Set the priority for the backing thread. See note for constructor. 
**/ void setPriority(ThreadPriority priority) { std::lock_guard lk(queueMutex_); priority_ = priority; } /** * Schedule a task * * @param sched A backing schedulable, such as ImmediateInvoker, NewThreadInvoker, TaskSet, or * ThreadPool to run the function in when it is scheduled. * @param func A bool() function to run when scheduled. The function may return true to indicate * it should continue to be scheduled, or false to cancel. * @param nextRunAbs The absolute time to run the function. Time scale is expected to match * getTime() for absolute times. * @param period The period in seconds. * @param timesToRun The number of times to run the function. After that number, the function * will not be called again. * @param type The type of periodicity (if any). **/ template TimedTask schedule( Schedulable& sched, F&& func, double nextRunAbs, double period = 0.0, size_t timesToRun = 1, TimedTaskType type = TimedTaskType::kNormal) { TimedTask task(sched, std::forward(func), nextRunAbs, period, timesToRun, type); addTimedTask(task.impl_); return task; } /** * Schedule a task to run once at a time in the future * * @param sched A backing schedulable, such as ImmediateInvoker, NewThreadInvoker, TaskSet, or * ThreadPool to run the function in when it is scheduled. * @param func A bool() function to run when scheduled. The function may return true to indicate * it should continue to be scheduled, or false to cancel. * @param timeInFuture the amount of time from current at which to schedule the function **/ template TimedTask schedule(Schedulable& sched, F&& func, const std::chrono::duration& timeInFuture) { return schedule(sched, std::forward(func), toNextRun(timeInFuture)); } /** * Schedule a task to run once at a time in the future * * @param sched A backing schedulable, such as ImmediateInvoker, NewThreadInvoker, TaskSet, or * ThreadPool to run the function in when it is scheduled. * @param func A bool() function to run when scheduled. The function may return true to indicate * it should continue to be scheduled, or false to cancel. * @param nextRunTime An absolute time to run the function. If in the past, func will run * immediately. **/ template TimedTask schedule( Schedulable& sched, F&& func, const std::chrono::time_point& nextRunTime) { return schedule(sched, std::forward(func), toNextRun(nextRunTime)); } /** * Schedule a task to run periodically * * @param sched A backing schedulable, such as ImmediateInvoker, NewThreadInvoker, TaskSet, or * ThreadPool to run the function in when it is scheduled. * @param func A bool() function to run when scheduled. The function may return true to indicate * it should continue to be scheduled, or false to cancel. * @param timeInFuture the amount of time from current at which to schedule the function * @param period The period defining the run frequency * @param timesToRun The number of times to run the function. After that number, the function * will not be called again. * @param type The type of periodicity (if any). **/ template TimedTask schedule( Schedulable& sched, F&& func, const std::chrono::duration& timeInFuture, const std::chrono::duration& period, size_t timesToRun = std::numeric_limits::max(), TimedTaskType type = TimedTaskType::kNormal) { return schedule( sched, std::forward(func), toNextRun(timeInFuture), toPeriod(period), timesToRun, type); } /** * Schedule a task to run periodically * * @param sched A backing schedulable, such as ImmediateInvoker, NewThreadInvoker, TaskSet, or * ThreadPool to run the function in when it is scheduled. 
* @param func A bool() function to run when scheduled. The function may return true to indicate * it should continue to be scheduled, or false to cancel. * @param nextRunTime An absolute time to run the function. If in the past, func will run * immediately. * @param period The period defining the run frequency * @param timesToRun The number of times to run the function. After that number, the function * will not be called again. * @param type The type of periodicity (if any). **/ template < typename Schedulable, typename Rep, typename Period, typename Clock, typename Duration, typename F> TimedTask schedule( Schedulable& sched, F&& func, const std::chrono::time_point& nextRunTime, const std::chrono::duration& period, size_t timesToRun = std::numeric_limits::max(), TimedTaskType type = TimedTaskType::kNormal) { return schedule( sched, std::forward(func), toNextRun(nextRunTime), toPeriod(period), timesToRun, type); } private: template static double toNextRun(const std::chrono::duration& timeInFuture) { return getTime() + std::chrono::duration(timeInFuture).count(); } template static double toNextRun(const std::chrono::time_point& nextRunTime) { auto curTime = Clock::now(); return toNextRun(nextRunTime - curTime); } template static double toPeriod(const std::chrono::duration& period) { return std::chrono::duration(period).count(); } DISPENSO_DLL_ACCESS void addTimedTask(std::shared_ptr task); void timeQueueRunLoop(); void kickOffTask(std::shared_ptr next, double curTime); struct Compare { bool operator()( const std::shared_ptr& a, const std::shared_ptr& b) const { return a->nextAbsTime > b->nextAbsTime; } }; // TODO(bbudge): Consider lock-free priority queue implementation. I'd expect it to be minimally // beneficial for this use case though... timed tasks should rarely be super-high contention. std::mutex queueMutex_; std::priority_queue< std::shared_ptr, std::vector>, Compare> tasks_; bool running_{true}; detail::EpochWaiter epoch_; std::thread thread_; ThreadPriority priority_; }; /** * Access the global timed task scheduler. Most applications should only require one scheduler. * * @return A reference to the single instance global timed task scheduler. **/ DISPENSO_DLL_ACCESS TimedTaskScheduler& globalTimedTaskScheduler(); } // namespace dispenso ================================================ FILE: dispenso/timing.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #if defined(_MSC_VER) #include #endif // _MSC_VER #if defined(_WIN32) #include #endif // _WIN32 #if defined(__MACH__) #include #include #endif // __MACH__ namespace dispenso { namespace { #if defined(__x86_64__) || defined(_M_AMD64) #define DISPENSO_HAS_TIMESTAMP #if defined(_MSC_VER) inline uint64_t rdtscp() { uint32_t ui; return __rdtscp(&ui); } #else inline uint64_t rdtscp() { uint32_t lo, hi; __asm__ volatile("rdtscp" : /* outputs */ "=a"(lo), "=d"(hi) : /* no inputs */ : /* clobbers */ "%rcx"); return (uint64_t)lo | (((uint64_t)hi) << 32); } #endif // OS #elif (defined(__GNUC__) || defined(__clang__)) && defined(__aarch64__) #define DISPENSO_HAS_TIMESTAMP uint64_t rdtscp(void) { uint64_t val; __asm__ volatile("mrs %0, cntvct_el0" : "=r"(val)); return val; } #endif // ARCH } // namespace #if defined(DISPENSO_HAS_TIMESTAMP) #if !defined(__aarch64__) static bool snapFreq(double& firstApprox) { switch (static_cast(firstApprox)) { case 0: if (std::abs(int(firstApprox * 10.0)) <= 1) { firstApprox = 0.0; return true; } break; case 9: if (std::abs(int(firstApprox * 10.0) - 99) <= 1) { firstApprox = 10.0; return true; } break; case 3: if (std::abs(int(firstApprox * 10.0) - 33) <= 1) { firstApprox = 3.0 + 1.0 / 3.0; return true; } break; case 6: if (std::abs(int(firstApprox * 10.0) - 66) <= 1) { firstApprox = 6.0 + 2.0 / 3.0; return true; } break; } return false; } static double fallbackTicksPerSecond() { using namespace std::chrono_literals; constexpr double kChronoOverheadBias = 250e-9; auto baseStart = std::chrono::high_resolution_clock::now(); auto start = rdtscp(); std::this_thread::sleep_for(50ms); auto end = rdtscp(); auto baseEnd = std::chrono::high_resolution_clock::now(); auto base = std::chrono::duration(baseEnd - baseStart).count() - kChronoOverheadBias; double firstApprox = (static_cast(end - start)) / base; // Try to refine the approximation. In some circumstances we can "snap" the frequency to a very // good guess that is off by less than one part in thousands. Accuracy should already be quite // good in any case, but this allows us to improve in some cases. // Get first 3 digits firstApprox *= 1e-7; int firstInt = static_cast(firstApprox); firstApprox -= firstInt; firstApprox *= 10.0; if (!snapFreq(firstApprox)) { int secondInt = static_cast(firstApprox); firstApprox -= secondInt; firstApprox *= 10.0; snapFreq(firstApprox); firstApprox *= 0.1; firstApprox += secondInt; } firstApprox *= 0.1; firstApprox += firstInt; firstApprox *= 1e7; return firstApprox; } #endif // !__aarch64__ #if defined(__aarch64__) static double ticksPerSecond() { uint64_t val; __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(val)); return static_cast(val); } #elif defined(__MACH__) static double ticksPerSecond() { mach_timebase_info_data_t info; if (mach_timebase_info(&info) != KERN_SUCCESS) { return fallbackTicksPerSecond(); } return 1e9 * static_cast(info.denom) / static_cast(info.numer); } #else double ticksPerSecond() { return fallbackTicksPerSecond(); } #endif double getTime() { static double secondsPerTick = 1.0 / ticksPerSecond(); static double startTime = static_cast(rdtscp()) * secondsPerTick; double t = static_cast(rdtscp()) * secondsPerTick; return t - startTime; } #else double getTime() { static auto startTime = std::chrono::high_resolution_clock::now(); auto cur = std::chrono::high_resolution_clock::now(); return std::chrono::duration(cur - startTime).count(); } #endif // DISPENSO_HAS_TIMESTAMP namespace { // This should ensure that we initialize the time before main. 
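// Descriptive note: forcing getTime() to run during static initialization makes its
// function-local statics (the ticks-per-second calibration and the start-time offset) pay
// their one-time cost at startup (including the roughly 50 ms fallback calibration sleep
// used on some x86 configurations), and pins the returned time origin near program start.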
double g_dummyTime = getTime(); } // namespace } // namespace dispenso ================================================ FILE: dispenso/timing.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file timing.h * @ingroup group_util * Utilities for getting the current time. **/ #pragma once #include namespace dispenso { /** * Get elapsed time in seconds since program start. * * Uses high-resolution timing when available (e.g., RDTSC on x86). * * @return Elapsed time in seconds as a double. */ DISPENSO_DLL_ACCESS double getTime(); } // namespace dispenso ================================================ FILE: dispenso/tsan_annotations.cpp ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include #if DISPENSO_HAS_TSAN #ifdef __GNUC__ #define ATTRIBUTE_WEAK __attribute__((weak)) #else #define ATTRIBUTE_WEAK #endif // These are found in the accompanying libtsan, but there is no header exposing them. We want to // also avoid exposing them in a header to to discourage folks from calling them directly. extern "C" { void AnnotateIgnoreReadsBegin(const char* f, int l) ATTRIBUTE_WEAK; void AnnotateIgnoreReadsEnd(const char* f, int l) ATTRIBUTE_WEAK; void AnnotateIgnoreWritesBegin(const char* f, int l) ATTRIBUTE_WEAK; void AnnotateIgnoreWritesEnd(const char* f, int l) ATTRIBUTE_WEAK; void AnnotateNewMemory(const char* f, int l, const volatile void* address, long size) ATTRIBUTE_WEAK; void AnnotateHappensBefore(const char* f, int l, const volatile void* address) ATTRIBUTE_WEAK; void AnnotateHappensAfter(const char* f, int l, const volatile void* address) ATTRIBUTE_WEAK; } namespace dispenso { namespace detail { void annotateIgnoreWritesBegin(const char* f, int l) { AnnotateIgnoreWritesBegin(f, l); } void annotateIgnoreWritesEnd(const char* f, int l) { AnnotateIgnoreWritesEnd(f, l); } void annotateIgnoreReadsBegin(const char* f, int l) { AnnotateIgnoreReadsBegin(f, l); } void annotateIgnoreReadsEnd(const char* f, int l) { AnnotateIgnoreReadsEnd(f, l); } void annotateNewMemory(const char* f, int l, const volatile void* address, long size) { AnnotateNewMemory(f, l, address, size); } void annotateHappensBefore(const char* f, int l, const volatile void* address) { AnnotateHappensBefore(f, l, address); } void annotateHappensAfter(const char* f, int l, const volatile void* address) { AnnotateHappensAfter(f, l, address); } } // namespace detail } // namespace dispenso #endif // TSAN ================================================ FILE: dispenso/tsan_annotations.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file tsan_annotations.h * @ingroup group_util * This file exposes a set of macros for ignoring tsan errors. These should generally not * be used just to shut up TSAN, because most of the time, TSAN reports real bugs. They should be * used only when there is a high level of certainty that TSAN is spitting out a false positive, as * can occasionally happen with lock-free algorithms. 
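 *
 * A minimal illustrative use (the surrounding names here are hypothetical) brackets a write
 * that has been carefully reasoned about as benign:
 * @code
 *   DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
 *   sharedHint = newValue;  // racy by design; correctness does not depend on this value
 *   DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END();
 * @endcode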
* * When these are required, it is best to keep the scope as small as possible to avoid blinding TSAN * to real bugs. Note that several libraries already expose macros like these, but we want to * keep dependencies to a bare minimum. **/ #pragma once #include #if defined(__SANITIZE_THREAD__) #define DISPENSO_HAS_TSAN 1 #elif defined(__has_feature) #if __has_feature(thread_sanitizer) #define DISPENSO_HAS_TSAN 1 #else #define DISPENSO_HAS_TSAN 0 #endif // TSAN #else #define DISPENSO_HAS_TSAN 0 #endif // feature #if DISPENSO_HAS_TSAN namespace dispenso { namespace detail { DISPENSO_DLL_ACCESS void annotateIgnoreWritesBegin(const char* f, int l); DISPENSO_DLL_ACCESS void annotateIgnoreWritesEnd(const char* f, int l); DISPENSO_DLL_ACCESS void annotateIgnoreReadsBegin(const char* f, int l); DISPENSO_DLL_ACCESS void annotateIgnoreReadsEnd(const char* f, int l); DISPENSO_DLL_ACCESS void annotateNewMemory(const char* f, int l, const volatile void* address, long size); DISPENSO_DLL_ACCESS void annotateHappensBefore(const char* f, int l, const volatile void* address); DISPENSO_DLL_ACCESS void annotateHappensAfter(const char* f, int l, const volatile void* address); } // namespace detail } // namespace dispenso #define DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN() \ ::dispenso::detail::annotateIgnoreWritesBegin(__FILE__, __LINE__) #define DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END() \ ::dispenso::detail::annotateIgnoreWritesEnd(__FILE__, __LINE__) #define DISPENSO_TSAN_ANNOTATE_IGNORE_READS_BEGIN() \ ::dispenso::detail::annotateIgnoreReadsBegin(__FILE__, __LINE__) #define DISPENSO_TSAN_ANNOTATE_IGNORE_READS_END() \ ::dispenso::detail::annotateIgnoreReadsEnd(__FILE__, __LINE__) #define DISPENSO_TSAN_ANNOTATE_NEW_MEMORY(address, size) \ ::dispenso::detail::annotateNewMemory(__FILE__, __LINE__, address, size) #define DISPENSO_TSAN_ANNOTATE_HAPPENS_BEFORE(address) \ ::dispenso::detail::annotateHappensBefore(__FILE__, __LINE__, address) #define DISPENSO_TSAN_ANNOTATE_HAPPENS_AFTER(address) \ ::dispenso::detail::annotateHappensAfter(__FILE__, __LINE__, address) #else #define DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_BEGIN() #define DISPENSO_TSAN_ANNOTATE_IGNORE_WRITES_END() #define DISPENSO_TSAN_ANNOTATE_IGNORE_READS_BEGIN() #define DISPENSO_TSAN_ANNOTATE_IGNORE_READS_END() #define DISPENSO_TSAN_ANNOTATE_NEW_MEMORY(address, size) #define DISPENSO_TSAN_ANNOTATE_HAPPENS_BEFORE(address) #define DISPENSO_TSAN_ANNOTATE_HAPPENS_AFTER(address) #endif // DISPENSO_HAS_TSAN ================================================ FILE: dispenso/util.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ /** * @file util.h * @ingroup group_util * A collection of utility functions and types for memory alignment, bit manipulation, * and performance optimization. * * @note The constant `kCacheLineSize` used by several utilities here is defined in * ``, which is included by this header. **/ #pragma once #include #include #include namespace dispenso { /** * @brief Allocate memory with a specified alignment. * * This function allocates memory aligned to the specified boundary. The alignment * must be a power of 2 and at least sizeof(uintptr_t). 
* * @param bytes The number of bytes to allocate * @param alignment The alignment requirement in bytes (must be power of 2) * @return Pointer to aligned memory, or nullptr on allocation failure * * @note Memory allocated with alignedMalloc must be freed with alignedFree * @see alignedFree * * Example: * @code * void* ptr = dispenso::alignedMalloc(1024, 64); // 64-byte aligned * // ... use ptr ... * dispenso::alignedFree(ptr); * @endcode */ inline void* alignedMalloc(size_t bytes, size_t alignment) { return detail::alignedMalloc(bytes, alignment); } /** * @brief Allocate memory aligned to cache line size. * * This is a convenience overload that aligns to kCacheLineSize (typically 64 bytes), * which helps avoid false sharing in concurrent data structures. * * @param bytes The number of bytes to allocate * @return Pointer to cache-line aligned memory, or nullptr on allocation failure * * @note Memory allocated with alignedMalloc must be freed with alignedFree * @see alignedFree, kCacheLineSize */ inline void* alignedMalloc(size_t bytes) { return detail::alignedMalloc(bytes); } /** * @brief Free memory allocated by alignedMalloc. * * @param ptr Pointer to memory allocated by alignedMalloc (can be nullptr) * * @see alignedMalloc */ inline void alignedFree(void* ptr) { detail::alignedFree(ptr); } /** * @brief Deleter for smart pointers that use aligned memory allocation. * * This deleter calls the destructor and frees memory allocated with alignedMalloc. * It can be used with std::unique_ptr and std::shared_ptr. * * @tparam T The type being deleted * * Example: * @code * using AlignedPtr = std::unique_ptr<MyType, dispenso::AlignedDeleter<MyType>>; * void* mem = dispenso::alignedMalloc(sizeof(MyType), 64); * AlignedPtr ptr(new (mem) MyType(), dispenso::AlignedDeleter<MyType>()); * @endcode */ template <typename T> using AlignedDeleter = detail::AlignedFreeDeleter<T>; /** * @brief Align a value up to the next cache line boundary. * * Rounds up the input value to the next multiple of kCacheLineSize. Useful for * manual memory layout to avoid false sharing. * * @param val The value to align * @return Value aligned to next cache line boundary * * @see kCacheLineSize * * Example: * @code * size_t offset = dispenso::alignToCacheLine(37); // Returns 64 * @endcode */ inline constexpr uintptr_t alignToCacheLine(uintptr_t val) { return detail::alignToCacheLine(val); } /** * @brief CPU relaxation hint for spin loops. * * Emits a platform-specific instruction (PAUSE on x86, YIELD on ARM) to improve * spin loop performance and reduce power consumption. Use this in busy-wait loops * to be friendlier to hyper-threading and the CPU pipeline. * * @note This is a no-op on platforms without a specific relax instruction * * Example: * @code * while (!flag.load(std::memory_order_acquire)) { * dispenso::cpuRelax(); // Be nice to the CPU * } * @endcode */ inline void cpuRelax() { detail::cpuRelax(); } /** * @brief Round up to the next power of 2. * * Computes the smallest power of 2 that is greater than or equal to the input value. * * @param v Input value * @return Next power of 2 (or v if v is already a power of 2) * * @note Returns 0 if v is 0 * * Example: * @code * static_assert(dispenso::nextPow2(17) == 32); * static_assert(dispenso::nextPow2(64) == 64); * @endcode */ constexpr uint64_t nextPow2(uint64_t v) { return detail::nextPow2(v); } /** * @brief Compute log base 2 of a value (compile-time). * * Computes floor(log2(v)) at compile time. Useful for template metaprogramming * and constexpr contexts.
* * @param v Input value (must be > 0) * @return floor(log2(v)) * * @note Behavior is undefined if v is 0 * * Example: * @code * static_assert(dispenso::log2const(64) == 6); * static_assert(dispenso::log2const(100) == 6); * @endcode */ constexpr inline uint32_t log2const(uint64_t v) { return detail::log2const(v); } /** * @brief Compute log base 2 of a value (runtime). * * Computes floor(log2(v)) using platform-specific intrinsics for optimal performance. * On x86/x64, uses __builtin_clzll or __lzcnt64. Falls back to constexpr version * on other platforms. * * @param v Input value (must be > 0) * @return floor(log2(v)) * * @note Behavior is undefined if v is 0 * * Example: * @code * uint32_t power = dispenso::log2(size); // Fast bit scan * @endcode */ inline uint32_t log2(uint64_t v) { return detail::log2(v); } /** * @brief Buffer with proper alignment for type T. * * Provides uninitialized storage with proper alignment for type T. Useful for * manual object lifetime management or placement new scenarios. * * @tparam T The type to provide storage for * * Example: * @code * dispenso::AlignedBuffer<MyType> buf; * MyType* obj = new (buf.b) MyType(); * obj->~MyType(); * @endcode */ template <typename T> using AlignedBuffer = detail::AlignedBuffer<T>; /** * @brief Cache-line aligned atomic pointer. * * An atomic pointer aligned to cache line boundary to avoid false sharing. * Inherits from std::atomic<T*>. * * @tparam T The pointed-to type * * Example: * @code * dispenso::AlignedAtomic<int> ptr; * ptr.store(new int(42)); * @endcode */ template <typename T> using AlignedAtomic = detail::AlignedAtomic<T>; /** * @brief Optional-like storage with in-place construction (C++14 compatible). * * Provides similar functionality to std::optional but works in C++14 codebases. * Stores the value in-place and tracks whether a value is present. * * @tparam T The type to store * * @note For small types, this has ~2x the size overhead of std::optional (pointer vs bool flag). * For larger types, the overhead is similar. Prefer std::optional in C++17 and later. * * Example: * @code * dispenso::OpResult<int> result; * if (success) { * result.emplace(42); * } * if (result) { * use(result.value()); * } * @endcode */ template <typename T> using OpResult = detail::OpResult<T>; /** * @brief Information for statically chunking a range across threads. * * When dividing work into static chunks, using a simple chunk size plus remainder * can lead to poor load balancing. This struct provides the optimal chunking strategy * where some tasks get ceil(items/chunks) work and others get floor(items/chunks). */ using StaticChunking = detail::StaticChunking; /** * @brief Compute optimal static chunking for load balancing. * * Divides items into chunks such that the work is distributed as evenly as possible. * Returns chunking info where some tasks get ceil(items/chunks) and others get * floor(items/chunks). * * @param items Total number of items to process * @param chunks Number of chunks to divide into (must be > 0) * @return StaticChunking information for distributing the work * * Example: * @code * auto chunking = dispenso::staticChunkSize(100, 8); * // First 4 threads get 13 items each, last 4 get 12 items each * // 4*13 + 4*12 = 52 + 48 = 100 * @endcode */ inline StaticChunking staticChunkSize(ssize_t items, ssize_t chunks) { return detail::staticChunkSize(items, chunks); } } // namespace dispenso
================================================ FILE: dispenso/utils/graph_dot.h ================================================
/* * Copyright (c) Meta Platforms, Inc. and affiliates.
* * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include <dispenso/graph.h> #include <fstream> #include <string> #include <unordered_map> namespace detail { inline std::string getName( const void* ptr, const size_t index, const std::unordered_map<uintptr_t, std::string>* nodeNames) { const uintptr_t key = reinterpret_cast<uintptr_t>(ptr); if (nodeNames) { auto it = nodeNames->find(key); if (it != nodeNames->end()) { return it->second; } } return std::to_string(index); } } // namespace detail namespace dispenso { /** * Write a Graphviz DOT rendering of a graph, its subgraphs, and node dependency edges to a file. * If nodeNames is provided, it maps node and subgraph addresses (as uintptr_t) to display labels; * otherwise nodes and subgraphs are labeled by their index. */ template <typename G> void graphsToDot( const char* filename, const G& graph, const std::unordered_map<uintptr_t, std::string>* nodeNames) { using SubgraphType = typename G::SubgraphType; using NodeType = typename G::NodeType; std::ofstream datfile(filename); datfile << R"dot(digraph { rankdir = LR node [shape = rectangle, style = filled, colorscheme=pastel19] graph [style = filled, color = Gray95] subgraph cluster_l { label = "Legend"; style=solid; color=black empty1 [style = invis, shape=point] empty2 [style = invis, shape=point] incomplete [color = 1] completed [color = 2] incomplete -> empty1 [label = "normal"] completed -> empty2 [arrowhead = onormal,label = "bidirectional\lpropagation"] } )dot"; const size_t numSubgraphs = graph.numSubgraphs(); for (size_t i = 0; i < numSubgraphs; ++i) { const SubgraphType& s = graph.subgraph(i); if (i != 0) { datfile << " " << "subgraph cluster_" << i << " { label = \"" << ::detail::getName(&s, i, nodeNames) << "\"\n"; } const size_t numNodes = s.numNodes(); for (size_t j = 0; j < numNodes; ++j) { const NodeType& node = s.node(j); datfile << " " << reinterpret_cast<uintptr_t>(&node) << " [color = " << (node.isCompleted() ? 2 : 1); datfile << " label = \"" << ::detail::getName(&node, j, nodeNames) << "\"]\n"; } if (i != 0) { datfile << " }\n"; } } graph.forEachNode([&](const NodeType& node) { node.forEachDependent([&](const dispenso::Node& d) { datfile << " " << reinterpret_cast<uintptr_t>(&node) << " -> " << reinterpret_cast<uintptr_t>(&d); if (std::is_same<NodeType, BiPropNode>::value) { const auto& node1 = static_cast<const BiPropNode&>(node); const auto& node2 = static_cast<const BiPropNode&>(d); datfile << (node1.isSameSet(node2) ? "[arrowhead=onormal]" : ""); } datfile << '\n'; }); }); datfile << "}"; datfile.close(); } } // namespace dispenso
================================================ FILE: docs/Doxyfile ================================================
# Doxyfile 1.8.14 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See # https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8.
DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = dispenso # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = 1.5.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "A library for task parallelism" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = ./doxygen # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. 
BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. 
JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines (in the resulting output). You can put ^^ in the value part of an # alias to insert a newline as if a physical newline was in the original file. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. 
With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the later case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. # Minimum value: 0, maximum value: 99, default value: 0. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. 
If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. 
LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. 
Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. 
# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. 
Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. 
Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = doxygen_warnings.log #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../dispenso mainpage.md getting_started.md groups.dox # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, # *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. FILE_PATTERNS = *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.inl \ *.markdown \ *.md \ *.dox # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = dispenso::detail::* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. 
The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = dispenso::detail::* \ *detail* \ DISPENSO_* \ TaskSetBase \ ChunkedRange \ ForceQueuingTag \ value_type \ reference \ const_reference \ size_type \ difference_type \ reference_type \ const_reference_type \ pointer \ const_pointer \ iterator \ const_iterator \ reverse_iterator \ const_reverse_iterator \ NodeType \ SubgraphType \ OpResult \ ssize_t \ kDefault* \ kImmediate* \ kNewThread* \ kCompleted \ kStatic \ ParentCascadeCancel \ operator== \ operator!= \ operator< \ operator> \ operator<= \ operator>= \ operator* \ "operator new" \ "operator delete" \ swap \ share \ then \ numIncompletePredecessors_ \ numPredecessors_ \ dependsOnOneNode \ DO_PRAGMA \ dispenso::cv::* \ *cv::* \ alloc \ dealloc # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = ../examples # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = *.cpp # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # # # where is the value of the INPUT_FILTER tag, and is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. 
FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = mainpage.md #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = YES # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. 
# # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. 
# Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = header.html # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = third-party/doxygen-awesome/doxygen-awesome.css custom.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = third-party/doxygen-awesome/doxygen-awesome-darkmode-toggle.js # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 124 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. 
# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for
# each generated HTML page. If the tag is left blank doxygen will generate a
# standard footer. See HTML_HEADER for more information on how to generate a
# default footer and what special commands can be used inside the footer. See
# also section "Doxygen usage" for information on how to generate the default
# footer that doxygen normally uses.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_FOOTER            =

# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
# sheet that is used by each HTML page. It can be used to fine-tune the look of
# the HTML output. If left blank doxygen will generate a default style sheet.
# See also section "Doxygen usage" for information on how to generate the style
# sheet that doxygen normally uses.
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
# it is more robust and this tag (HTML_STYLESHEET) will in the future become
# obsolete.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_STYLESHEET        =

# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# cascading style sheets that are included after the standard style sheets
# created by doxygen. Using this option one can overrule certain style aspects.
# This is preferred over using HTML_STYLESHEET since it does not replace the
# standard style sheet and is therefore more robust against future updates.
# Doxygen will copy the style sheet files to the output directory.
# Note: The order of the extra style sheet files is of importance (e.g. the
# last style sheet in the list overrules the setting of the previous ones in
# the list). For an example see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_EXTRA_STYLESHEET  = third-party/doxygen-awesome/doxygen-awesome.css custom.css

# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the HTML output directory. Note
# that these files will be copied to the base HTML output directory. Use the
# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
# files. In the HTML_STYLESHEET file, use the file name only. Also note that
# the files will be copied as-is; there are no commands or markers available.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_EXTRA_FILES       = third-party/doxygen-awesome/doxygen-awesome-darkmode-toggle.js
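# Sketch only, assuming the custom header.html referenced above is where the
# extra file gets wired in: a script listed in HTML_EXTRA_FILES would typically
# be loaded from the header via the $relpath^ marker mentioned above, e.g.
#
#   <script type="text/javascript"
#           src="$relpath^doxygen-awesome-darkmode-toggle.js"></script>
#
# How the script is then initialized is determined by the theme in use
# (doxygen-awesome here), not by this configuration file.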
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
# this color. Hue is specified as an angle on a colorwheel, see
# https://en.wikipedia.org/wiki/Hue for more information. For instance the
# value 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue,
# 300 purple, and 360 is red again.
# Minimum value: 0, maximum value: 359, default value: 220.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_HUE    = 124

# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
# in the HTML output. For a value of 0 the output will use grayscales only. A
# value of 255 will produce the most vivid colors.
# Minimum value: 0, maximum value: 255, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_SAT    = 100

# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
# luminance component of the colors in the HTML output. Values below 100
# gradually make the output lighter, whereas values above 100 make the output
# darker. The value divided by 100 is the actual gamma applied, so 80
# represents a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100
# does not change the gamma.
# Minimum value: 40, maximum value: 240, default value: 80.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_COLORSTYLE_GAMMA  = 80

# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated
# HTML page will contain the date and time when the page was generated. Setting
# this to YES can help to show when doxygen was last run and thus if the
# documentation is up to date.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_TIMESTAMP         = NO

# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
# documentation will contain a main index with vertical navigation menus that
# are dynamically created via Javascript. If disabled, the navigation index
# will consist of multiple levels of tabs that are statically embedded in every
# HTML page. Disable this option to support browsers that do not have
# Javascript, like the Qt help browser.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_DYNAMIC_MENUS     = YES

# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
# documentation will contain sections that can be hidden and shown after the
# page has loaded.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_DYNAMIC_SECTIONS  = NO

# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
# shown in the various tree structured indices initially; the user can expand
# and collapse entries dynamically later on. Doxygen will expand the tree to
# such a level that at most the specified number of entries are visible (unless
# a fully collapsed tree already exceeds this amount). So setting the number of
# entries to 1 will produce a fully collapsed tree by default. 0 is a special
# value representing an infinite number of entries and will result in a fully
# expanded tree by default.
# Minimum value: 0, maximum value: 9999, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_INDEX_NUM_ENTRIES = 100

# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated
# development environment (see: https://developer.apple.com/tools/xcode/),
# introduced with OSX 10.5 (Leopard). To create a documentation set, doxygen
# will generate a Makefile in the HTML output directory. Running make will
# produce the docset in that directory and running make install will install
# the docset in ~/Library/Developer/Shared/Documentation/DocSets so that Xcode
# will find it at startup. See
# https://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more
# information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_DOCSET        = NO
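# Sketch of the workflow described above (commands paraphrased from the
# description, illustrative only; this project leaves GENERATE_DOCSET
# disabled). With GENERATE_DOCSET = YES, the docset is built from the Makefile
# that doxygen writes into the HTML output directory:
#
#   cd html          # the HTML_OUTPUT directory configured above
#   make             # produce the .docset bundle in this directory
#   make install     # copy it under ~/Library/Developer/Shared/Documentation/DocSets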
# This tag determines the name of the docset feed. A documentation feed
# provides an umbrella under which multiple documentation sets from a single
# provider (such as a company or product suite) can be grouped.
# The default value is: Doxygen generated docs.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_FEEDNAME        = "Doxygen generated docs"

# This tag specifies a string that should uniquely identify the documentation
# set bundle. This should be a reverse domain-name style string, e.g.
# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_BUNDLE_ID       = org.doxygen.Project

# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
# the documentation publisher. This should be a reverse domain-name style
# string, e.g. com.mycompany.MyDocSet.documentation.
# The default value is: org.doxygen.Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_PUBLISHER_ID    = org.doxygen.Publisher

# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
# The default value is: Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.

DOCSET_PUBLISHER_NAME  = Publisher

# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help
# Workshop (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138)
# on Windows.
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
# files are now used as the Windows 98 help format, and will replace the old
# Windows help format (.hlp) on all Windows platforms in the future. Compressed
# HTML files also contain an index, a table of contents, and you can search for
# words in the documentation. The HTML workshop also contains a viewer for
# compressed HTML files.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_HTMLHELP      = NO

# The CHM_FILE tag can be used to specify the file name of the resulting .chm
# file. You can add a path in front of the file if the result should not be
# written to the html output directory.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

CHM_FILE               =

# The HHC_LOCATION tag can be used to specify the location (absolute path
# including file name) of the HTML help compiler (hhc.exe). If non-empty,
# doxygen will try to run the HTML help compiler on the generated index.hhp.
# The file has to be specified with full path.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

HHC_LOCATION           =

# The GENERATE_CHI flag controls whether a separate .chi index file is
# generated (YES) or whether it should be included in the master .chm file
# (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

GENERATE_CHI           = NO

# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
# and project file content.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

CHM_INDEX_ENCODING     =

# The BINARY_TOC flag controls whether a binary table of contents is generated
# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
# enables the Previous and Next buttons.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

BINARY_TOC             = NO

# The TOC_EXPAND flag can be set to YES to add extra items for group members to
# the table of contents of the HTML help documentation and to the tree view.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.

TOC_EXPAND             = NO

# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
# (.qch) of the generated HTML documentation.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_QHP           = NO

# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
# the file name of the resulting .qch file. The path specified is relative to
# the HTML output folder.
# This tag requires that the tag GENERATE_QHP is set to YES.

QCH_FILE               =

# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_NAMESPACE          = org.doxygen.Project

# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project /
# Virtual Folders (see:
# http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_VIRTUAL_FOLDER     = doc

# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_CUST_FILTER_NAME   =

# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project /
# Custom Filters (see:
# http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_CUST_FILTER_ATTRS  =

# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.

QHP_SECT_FILTER_ATTRS  =

# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.

QHG_LOCATION           =

# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
# generated; together with the HTML files, they form an Eclipse help plugin. To
# install this plugin and make it available under the help contents menu in
# Eclipse, the contents of the directory containing the HTML and XML files
# needs to be copied into the plugins directory of eclipse. The name of the
# directory within the plugins directory should be the same as the
# ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before the
# help appears.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_ECLIPSEHELP   = NO

# A unique identifier for the Eclipse help plugin. When installing the plugin
# the directory name containing the HTML and XML files should also have this
# name. Each documentation set should have its own identifier.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.

ECLIPSE_DOC_ID         = org.doxygen.Project

# If you want full control over the layout of the generated HTML pages it might
# be necessary to disable the index and replace it with your own. The
# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at
# top of each HTML page. A value of NO enables the index and the value YES
# disables it. Since the tabs in the index contain the same information as the
# navigation tree, you can set this option to YES if you also set
# GENERATE_TREEVIEW to YES.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

DISABLE_INDEX          = NO

# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
# structure should be generated to display hierarchical information. If the tag
# value is set to YES, a side panel will be generated containing a tree-like
# index structure (just like the one that is generated for HTML Help). For this
# to work a browser that supports JavaScript, DHTML, CSS and frames is required
# (i.e. any modern browser). Windows users are probably better off using the
# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one
# can further fine-tune the look of the index. As an example, the default style
# sheet generated by doxygen has an example that shows how to put an image at
# the root of the tree instead of the PROJECT_NAME. Since the tree basically
# has the same information as the tab index, you could consider setting
# DISABLE_INDEX to YES when enabling this option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

GENERATE_TREEVIEW      = YES
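# For illustration only (not the combination chosen here): if the tab index
# were dropped in favour of the tree view alone, as suggested in the comment
# above, the two related tags would typically be set together, e.g.
#
#   DISABLE_INDEX     = YES
#   GENERATE_TREEVIEW = YES
#
# This configuration keeps the tabs (DISABLE_INDEX = NO) alongside the tree.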
# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
# that doxygen will group on one line in the generated HTML documentation.
#
# Note that a value of 0 will completely suppress the enum values from
# appearing in the overview section.
# Minimum value: 0, maximum value: 20, default value: 4.
# This tag requires that the tag GENERATE_HTML is set to YES.

ENUM_VALUES_PER_LINE   = 4

# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
# to set the initial width (in pixels) of the frame in which the tree is shown.
# Minimum value: 0, maximum value: 1500, default value: 250.
# This tag requires that the tag GENERATE_HTML is set to YES.

TREEVIEW_WIDTH         = 250

# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
# external symbols imported via tag files in a separate window.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

EXT_LINKS_IN_WINDOW    = NO

# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
# output directory to force them to be regenerated.
# Minimum value: 8, maximum value: 50, default value: 10.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_FONTSIZE       = 10

# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
# Note that when changing this option you need to delete any form_*.png files
# in the HTML output directory before the changes have effect.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_TRANSPARENT    = YES

# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# https://www.mathjax.org) which uses client side Javascript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want the formulas to look prettier in the HTML output.
# When enabled you may also need to install MathJax separately and configure
# the path to it using the MATHJAX_RELPATH option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

USE_MATHJAX            = NO

# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_FORMAT         = HTML-CSS

# When MathJax is enabled you need to specify the location relative to the HTML
# output directory using the MATHJAX_RELPATH option. The destination directory
# should contain the MathJax.js script. For instance, if the mathjax directory
# is located at the same level as the HTML output directory, then
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
# MathJax from https://www.mathjax.org before deployment.
# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/
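# Sketch only (hypothetical paths, and not the setup used here since
# USE_MATHJAX is NO): rendering formulas with a locally installed MathJax copy
# placed next to the HTML output directory, as described above, would look
# roughly like:
#
#   USE_MATHJAX     = YES
#   MATHJAX_RELPATH = ../mathjax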
# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_EXTENSIONS     =

# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax
# site (see: http://docs.mathjax.org/en/latest/output.html) for more details.
# For an example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_CODEFILE       =

# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
# the HTML output. The underlying search engine uses javascript and DHTML and
# should work on any modern browser. Note that when using HTML help
# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
# there is already a search function so this one should typically be disabled.
# For large projects the javascript based search engine can be slow, then
# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
# search using the keyboard; to jump to the search box use <access key> + S
# (what the <access key> is depends on the OS and browser, but it is typically
# <CTRL>, <ALT>/